!pip install fastquant -qq
!pip install yfinance -qq
!pip install vaderSentiment -qq
!pip install tweet-preprocessor -qq
!pip install pyLDAvis -qq
!pip install gensim -qq
!pip install --upgrade smart_open -qq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import matplotlib.dates as dates
import datetime as dt
from datetime import timedelta
import warnings
from wordcloud import WordCloud, STOPWORDS
from gensim.models import LdaMulticore
import pyLDAvis.gensim
%matplotlib inline
pyLDAvis.enable_notebook()
from gensim.corpora.dictionary import Dictionary
from networkx.algorithms import bipartite
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora.dictionary import Dictionary

 
import yfinance as yf
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 

import preprocessor as p # Cleaner for tweet data.
import spacy
nlp=spacy.load('en_core_web_sm')
spacy_stopwords = nlp.Defaults.stop_words
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
stop_words = spacy_stopwords
tknzr = TweetTokenizer()
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import itertools
from collections import Counter 
from fastquant import backtest

import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers import Bidirectional
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import TfidfVectorizer

import re
import networkx as nx
import community as community_louvain
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tesla stock data

The Tesla stock data is imported by use of the package Yfinance, which imports data directly from Yahoo finance. This package allows for import of stock prices by day, but also all the way down to individual minutes or hours of a day. We choose to go by days, and not a more granular level like hours or minutes of days, as we need to be able to match the tweets of a day to the stock prices, and going down to hours or minutes makes this matching much more difficult, and may become a lot more arbitrary, as the hour a tweet is matched to may not be the hour that the tweet actually has an impact on the stock price. Therefore we go by day, in order to reduce the chance of arbitrary matching of stock prices and tweets, although this is still something we need to take into account.

# Handle to Yahoo Finance's TSLA listing. One Ticker object suffices for both
# history pulls; the original constructed two identical ones. `tesla_tick_real`
# is kept as an alias so any later cell that references it still works.
tesla_tick = yf.Ticker('TSLA')
tesla_tick_real = tesla_tick

# Daily candles: the 2020 modelling window, plus a short held-out window
# starting 2020-12-18 (yfinance treats `end` as exclusive).
tesla = tesla_tick.history(interval='1d', start='2020-01-01', end='2020-12-19')
tesla_now = tesla_tick_real.history(interval='1d', start='2020-12-18', end='2021-01-02')
tesla
tesla
Open High Low Close Volume Dividends Stock Splits
Date
2020-01-02 84.90 86.14 84.34 86.05 47660500 0 0.0
2020-01-03 88.10 90.80 87.38 88.60 88892500 0 0.0
2020-01-06 88.09 90.31 88.00 90.31 50665000 0 0.0
2020-01-07 92.28 94.33 90.67 93.81 89410500 0 0.0
2020-01-08 94.74 99.70 93.65 98.43 155721500 0 0.0
... ... ... ... ... ... ... ...
2020-12-14 619.00 642.75 610.20 639.83 52040600 0 0.0
2020-12-15 643.28 646.90 623.80 633.25 45223600 0 0.0
2020-12-16 628.23 632.50 605.00 622.77 42095800 0 0.0
2020-12-17 628.19 658.82 619.50 655.90 56270100 0 0.0
2020-12-18 668.90 695.00 628.54 695.00 222126200 0 0.0

245 rows × 7 columns

Twitter data

We collect tweets by use of the package Twint. It collects the tweets on an individual basis, and collects the newest first by the date and time each was created. We choose to specify a minimum of 10 likes for each tweet, as this reduces the chance of having bots that produce spam tweets, which was a problem before we introduced this limit. We do note that this limit is arbitrary in itself, as any given number above a few likes would most likely have sufficed, although a higher number does reduce the chance of bots simply liking bot-created tweets as well. The minimum-likes filter does, however, also have a practical implication, as Twint kept crashing when it reached a threshold of around 30,000-40,000 tweets. Without this limit we would have needed to run Twint 30-40 times over, whereas we only needed to run it twice with this limit. The likes limit also reflects some of the thinking about the impact that a tweet may have on peers, as a minimum of 10 likes (or any other number for that matter) may indicate some peer review and agreement among Twitter peers, which may very well have an impact on the stock price. We run this code below and save the result as two csv files locally.

#import twint
#import nest_asyncio
#nest_asyncio.apply()
#c = twint.Config()
#c.Search = "$TSLA"
#c.Lang = "en"
#c.Min_likes = 10
#c.Limit = 1000000
#c.Store_csv = True
#c.Output = "$TSLA_minlikes101.csv"

#twint.run.Search(c)

With the two csv-files saved locally in colab called $TSLA_minlikes101 and 102. These dataframes are hereafter appended to each other and thereby a single dataframe is created. This is uploaded to Github:

#We don't run this code, as this was done on the 18th of december,
#and does not need to be replicated.

#df_1 = pd.read_csv('$TSLA_minlikes101.csv')
#df_2 = pd.read_csv('$TSLA_minlikes102.csv')

#change the date to datetime
#df_1['date'] = pd.to_datetime(df_1['date'])  
#df_2['date'] = pd.to_datetime(df_2['date']) 

#Create filering "masks", in order to ensure that the dataframes don't have
#overlapping dates:
#mask1 = (df_1['date'] > '2020-07-14') & (df_1['date'] <= '2020-12-18')
#mask2 = (df_2['date'] >= '2020-01-01') & (df_2['date'] <= '2020-07-14')

#apply each mask
#df_1 = df_1.loc[mask1]
#df_2 = df_2.loc[mask2]

#append the dataframes
#df = df_1.append(df_2)

#reset the index
#df = df.reset_index()
#df.drop(columns=['index'])

#The dataframe was hereafter exported and uploaded to github.
# Zipped CSV of one year of $TSLA tweets, read straight from the GitHub raw URL.
df = pd.read_csv('https://github.com/JacobBaade/Tesla/blob/main/$TSLA_1_year.zip?raw=true', compression='zip')

Initial look at the dataframe containing tweets:

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76169 entries, 0 to 76168
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       76169 non-null  object 
 1   index            76169 non-null  object 
 2   id               76169 non-null  object 
 3   conversation_id  76169 non-null  object 
 4   created_at       76169 non-null  object 
 5   date             76169 non-null  object 
 6   time             76169 non-null  object 
 7   timezone         76169 non-null  float64
 8   user_id          76169 non-null  object 
 9   username         76169 non-null  object 
 10  name             76167 non-null  object 
 11  place            1 non-null      object 
 12  tweet            76168 non-null  object 
 13  language         76168 non-null  object 
 14  mentions         76167 non-null  object 
 15  urls             76167 non-null  object 
 16  photos           76167 non-null  object 
 17  replies_count    76167 non-null  float64
 18  retweets_count   76167 non-null  float64
 19  likes_count      76167 non-null  float64
 20  hashtags         76167 non-null  object 
 21  cashtags         76168 non-null  object 
 22  link             76167 non-null  object 
 23  retweet          76167 non-null  object 
 24  quote_url        13511 non-null  object 
 25  video            76167 non-null  float64
 26  thumbnail        25064 non-null  object 
 27  near             0 non-null      float64
 28  geo              0 non-null      float64
 29  source           0 non-null      float64
 30  user_rt_id       0 non-null      float64
 31  user_rt          0 non-null      float64
 32  retweet_id       0 non-null      float64
 33  reply_to         76167 non-null  object 
 34  retweet_date     0 non-null      float64
 35  translate        0 non-null      float64
 36  trans_src        0 non-null      float64
 37  trans_dest       0 non-null      float64
dtypes: float64(15), object(23)
memory usage: 22.1+ MB

We can see that a lot of the columns are either completely empty or missing a lot of values.

df.head()
Unnamed: 0 index id conversation_id created_at date time timezone user_id username name place tweet language mentions urls photos replies_count retweets_count likes_count hashtags cashtags link retweet quote_url video thumbnail near geo source user_rt_id user_rt retweet_id reply_to retweet_date translate trans_src trans_dest
0 0 0 1339879625783046144 1.33988e+18 2020-12-18 10:26:11 UTC 2020-12-18 10:26:11 0.0 9.85244e+17 teslaconomics Teslaconomics NaN REMEMBER: Today is one of the most critical da... en [] [] [] 3.0 0.0 9.0 ['tesla'] ['tsla'] https://twitter.com/Teslaconomics/status/13398... False NaN 0.0 NaN NaN NaN NaN NaN NaN NaN [] NaN NaN NaN NaN
1 1 1 1339877034240008192 1.33988e+18 2020-12-18 10:15:53 UTC 2020-12-18 10:15:53 0.0 1.77767e+07 becomingguru Lakshman Prasad NaN I nicely set up Google Sheets that auto-update... en [] ['http://hodlreturns.com/'] ['https://pbs.twimg.com/media/Epgysl1U0AIEIDm.... 3.0 3.0 16.0 ['btc'] ['tsla', 'zm'] https://twitter.com/becomingGuru/status/133987... False NaN 1.0 https://pbs.twimg.com/media/Epgysl1U0AIEIDm.jpg NaN NaN NaN NaN NaN NaN [] NaN NaN NaN NaN
2 2 2 1339874504626081796 1.33987e+18 2020-12-18 10:05:50 UTC 2020-12-18 10:05:50 0.0 3.02045e+08 alpsoy66 Alp NaN The key question is, will there be liquidity s... en [] [] [] 3.0 0.0 12.0 [] ['tsla'] https://twitter.com/Alpsoy66/status/1339874504... False NaN 0.0 NaN NaN NaN NaN NaN NaN NaN [] NaN NaN NaN NaN
3 3 3 1339868092252315648 1.33987e+18 2020-12-18 09:40:21 UTC 2020-12-18 09:40:21 0.0 1.30009e+18 o00o_investment ぶたのはな🐽30代育児投資家 NaN 今週もお疲れ様でした😊 🇯🇵日本株は日経⤵️TOPIX・マザーズ⤴️のマチマチでしたね🌀 ... ja [] [] [] 5.0 0.0 31.0 [] ['tsla'] https://twitter.com/o00o_investment/status/133... False NaN 0.0 NaN NaN NaN NaN NaN NaN NaN [] NaN NaN NaN NaN
4 4 4 1339867526994378754 1.33987e+18 2020-12-18 09:38:06 UTC 2020-12-18 09:38:06 0.0 1.06005e+18 eliburton_ Eli NaN Tesla is more than a stock $tsla https://t.co... en [] [] ['https://pbs.twimg.com/tweet_video_thumb/Epgq... 0.0 0.0 15.0 [] ['tsla'] https://twitter.com/EliBurton_/status/13398675... False NaN 1.0 https://pbs.twimg.com/tweet_video_thumb/EpgqaW... NaN NaN NaN NaN NaN NaN [] NaN NaN NaN NaN

After creating our dataframe, we need to filter the tweets by language, as the package we use for sentiment classification later on, can only recognize english text. Twint should have been able to filter based on language, but the implementation of this in Twint did not work when we collected the data.

Furthermore we enforce the minimum likes count as a few tweets with less than 10 slipped by the filter we applied in Twint.

# Enforce the >=10 likes threshold and keep only English tweets in one pass.
keep_mask = (df['likes_count'] >= 10) & (df['language'] == 'en')
df = df[keep_mask]
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 68586 entries, 1 to 76168
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       68586 non-null  object 
 1   index            68586 non-null  object 
 2   id               68586 non-null  object 
 3   conversation_id  68586 non-null  object 
 4   created_at       68586 non-null  object 
 5   date             68586 non-null  object 
 6   time             68586 non-null  object 
 7   timezone         68586 non-null  float64
 8   user_id          68586 non-null  object 
 9   username         68586 non-null  object 
 10  name             68585 non-null  object 
 11  place            0 non-null      object 
 12  tweet            68586 non-null  object 
 13  language         68586 non-null  object 
 14  mentions         68586 non-null  object 
 15  urls             68586 non-null  object 
 16  photos           68586 non-null  object 
 17  replies_count    68586 non-null  float64
 18  retweets_count   68586 non-null  float64
 19  likes_count      68586 non-null  float64
 20  hashtags         68586 non-null  object 
 21  cashtags         68586 non-null  object 
 22  link             68586 non-null  object 
 23  retweet          68586 non-null  object 
 24  quote_url        11898 non-null  object 
 25  video            68586 non-null  float64
 26  thumbnail        21702 non-null  object 
 27  near             0 non-null      float64
 28  geo              0 non-null      float64
 29  source           0 non-null      float64
 30  user_rt_id       0 non-null      float64
 31  user_rt          0 non-null      float64
 32  retweet_id       0 non-null      float64
 33  reply_to         68586 non-null  object 
 34  retweet_date     0 non-null      float64
 35  translate        0 non-null      float64
 36  trans_src        0 non-null      float64
 37  trans_dest       0 non-null      float64
dtypes: float64(15), object(23)
memory usage: 20.4+ MB

Next step is to look into the cashtags in order to narrow the scope of cashtags that our Twitter data contains. Some tweets contain many different cashtags, like $aapl and other stocks' cashtags, which might be tweets about Apple that have little relation to Tesla specifically. We do a value count to see the most prominent cashtags; these show up as multiple cashtags if more than one cashtag is used.

df.cashtags.value_counts()
['tsla']                                                         45947
['tsla', 'tslaq']                                                 6880
['tslaq', 'tsla']                                                 1748
['tsla', 'tsla']                                                  1527
['aapl', 'tsla']                                                   283
                                                                 ...  
['alpp', 'aapl', 'tsla', 'orcl', 'alpp']                             1
['tsla', 'msft', 'appl', 'twtr']                                     1
['tsla', 'rkt', 'wfc', 'baba', 'ostk', 'wmt']                        1
['pton', 'sbux', 'tsla', 'aapl', 'lulu']                             1
['zm', 'net', 'crwd', 'tsla', 'fsly', 'twlo', 'amzn', 'pton']        1
Name: cashtags, Length: 7169, dtype: int64

As we can see the main part of all tweets contain one of the four cashtags with the highest value count, so we remove anything below top 4, which is done simply by removing any cashtag below 1000 occurences:

# Keep only rows whose exact cashtag combination occurs more than 1000 times
# (i.e. the four dominant combinations identified above).
tag_counts = df['cashtags'].value_counts()
df = df[df['cashtags'].map(tag_counts) > 1000]
df.cashtags.value_counts()
['tsla']             45947
['tsla', 'tslaq']     6880
['tslaq', 'tsla']     1748
['tsla', 'tsla']      1527
Name: cashtags, dtype: int64

Next step is to choose the columns that we will actually need. We use .copy() to avoid any warning from python

# Keep only the columns we need. Selecting by NAME instead of the original
# positional iloc[:, [4,5,9,12,17,18,19]] — same columns (see df.info above),
# but robust to any reordering of the raw CSV. .copy() avoids
# SettingWithCopyWarning on later column assignments.
keep_cols = ['created_at', 'date', 'username', 'tweet',
             'replies_count', 'retweets_count', 'likes_count']
df = df[keep_cols].copy().reset_index(drop=True)
# Unparseable dates become NaT rather than raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56102 entries, 0 to 56101
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   created_at      56102 non-null  object        
 1   date            56102 non-null  datetime64[ns]
 2   username        56102 non-null  object        
 3   tweet           56102 non-null  object        
 4   replies_count   56102 non-null  float64       
 5   retweets_count  56102 non-null  float64       
 6   likes_count     56102 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(3)
memory usage: 3.0+ MB
df.describe()
replies_count retweets_count likes_count
count 56102.000000 56102.000000 56102.000000
mean 8.499626 9.796603 101.542351
std 33.286046 38.240599 413.977094
min 0.000000 0.000000 10.000000
25% 2.000000 1.000000 22.000000
50% 4.000000 3.000000 39.000000
75% 9.000000 9.000000 88.000000
max 5017.000000 5373.000000 46922.000000

Initial Exploratory Data Analysis

We start by doing some simple plots of the stock price's development, as compared to the number of tweets, replies, retweets, likes and a constructed column called "traction" per day.

# Each tweet contributes 1 to its day's tweet count.
df['tweet_count'] = 1

# Daily totals of engagement metrics, indexed by calendar date. A single
# named-aggregation groupby replaces the original four separate frames
# (df1..df4) plus three merges — same resulting columns, same index.
groupby = (
    df.groupby(df['date'].dt.date)
      .agg(replies=('replies_count', 'sum'),
           retweets=('retweets_count', 'sum'),
           likes=('likes_count', 'sum'),
           tweet_count=('tweet_count', 'sum'))
)
groupby
replies retweets likes tweet_count
date
2020-01-01 586.0 984.0 7688.0 110
2020-01-02 794.0 1337.0 9823.0 158
2020-01-03 1572.0 2963.0 24469.0 249
2020-01-04 619.0 1312.0 8435.0 93
2020-01-05 399.0 638.0 5201.0 81
... ... ... ... ...
2020-12-14 2694.0 1264.0 29883.0 292
2020-12-15 2295.0 1161.0 26759.0 282
2020-12-16 2511.0 1175.0 25761.0 264
2020-12-17 3864.0 2194.0 40237.0 420
2020-12-18 439.0 227.0 4804.0 56

353 rows × 4 columns

# Six stacked panels: the four daily engagement series on top, then TSLA
# closing price and trading volume. One (x, y, title, colour) tuple per panel
# replaces the original x1..y_v variable pairs and the repeated plot calls.
# NOTE(review): the original also built an unused random pd.Series (`idx`/`s`)
# here — removed as dead code.
panels = [
    (groupby.index, groupby['replies'],     'replies per day',     'tab:green'),
    (groupby.index, groupby['retweets'],    'retweets per day',    'tab:orange'),
    (groupby.index, groupby['likes'],       'likes per day',       'tab:red'),
    (groupby.index, groupby['tweet_count'], 'tweets per day',      'tab:purple'),
    (tesla.index,   tesla['Close'],         'TSLA stock price',    'tab:blue'),
    (tesla.index,   tesla['Volume'],        'TSLA trading volume', 'tab:blue'),
]

fig, axs = plt.subplots(len(panels), figsize=(15, 15))
for ax, (x, y, title, colour) in zip(axs, panels):
    ax.plot(x, y, colour)
    ax.set_title(title)
plt.tight_layout()

We see clear spikes in especially replies, likes and tweets per day, when the stock price is either in- or decreasing at a rapid pace. The spikes in retweets look to be less prominent. Do keep in mind that the amount of likes and replies is hard to match to a specific day. When a tweet for instance has 1000 likes, these may come over the following week. The same can, however, be said about the tweets themselves: even though they are easy to place in time, the effect on the stock price can only come at a later time than the actual tweet was created. This does mean, though, that they are a better predictor of changes in Tesla's stock price, as compared to likes, replies or retweets, which may come at any given time after the tweet is published. Therefore the count of tweets will be our focus - not because likes, replies and retweets are not just as interesting as predictors, but because we can't place them in time with the data we have collected.

Another reason for choosing the count of tweets over likes, replies and retweets is that if the classifier we use to find the sentiment of any given tweet classifies wrongly, the impact of that misclassification matters much less, as compared to misclassifying a tweet with 40,000 likes. It does also mean that the specific "traction" that a highly liked, replied and/or retweeted tweet can gain does not play any role, which does not reflect the real world, where one tweet that is highly liked will most likely have a higher impact on Tesla's stock price as compared to an identical tweet with few likes. The introduction of a minimum of 10 likes does ensure that a tweet at least gained some peer review.

Sentiment analysis - VADER

To classify whether our data of tweets about $Tsla is regarded as a positive, neutral, or negative tweet we will be using VADER to perform a sentiment analysis. VADER, Valence Aware Dictionary and sEntiment Reasoner, is a rule-based sentiment analysis tool. This tool is heavily used to perform sentiment analysis on social media data, which our data in the notebook is (Twitter data).

VADER uses a sentiment lexicon to classify whether features or words can be label as either positive or negative. VADER is pretrained using 10 independent humans to evaluate/rate each token. Each token is rated on a scale from -4 = Extremely Negative to 4 = Extremely Positive. Based on each word vader calculates a compound score, which is used to determine whether a word in it's given context is more so negative, positive or neutral.

The creators of vader recommends the following rules for the categories:

positive sentiment: compound score >= 0.05

neutral sentiment: (compound score > -0.05) and (compound score < 0.05)

negative sentiment: compound score <= -0.05

To see each word or token that VADER has been trained to recognise and label, the creators of VADER have made this list: https://github.com/cjhutto/vaderSentiment/blob/master/vaderSentiment/vader_lexicon.txt

Definition of positive, negative and neutral:

We define positive tweets as tweets that will contribute to a more positive view of Tesla as a company or specifically as a stock and thereby help drive the stockprice up (and not down).

We define negative tweets as tweets that will contribute to a more negative view of Tesla as a company or specifically as a stock and thereby help drive the stockprice down (and not up). (Mukhtar, 2020; Rao & Srivastava, 2012)

Neutral are tweets that will have no effect on the stock price. We do also see the neutral tweets as a bin for VADER to put all the tweets it has a hard time classifying.

sid = SentimentIntensityAnalyzer()

Here are some examples of how VADER classifies twitter data from our dataset:

a = 'Thanks Elon. Oh I just followed you so feel free to shoot me a quick Dm next time before you say $tsla stock is too high.   I’ve been daytrading to pass the time.  Not going great.   (Earmuffs SEC)'
sid.polarity_scores(a)
{'compound': 0.1985, 'neg': 0.123, 'neu': 0.736, 'pos': 0.14}

VADER says this is positive, which is kinda true, as this tweet is about Elon Musk saying the stock price is too high, although this tweet is hard to determine what the polarity score should be, as it is hard to discern how this tweet could impact the share prices, as it is a follower who talks about Elon saying the stock is too high, which might make it more neutral.

It perfectly illustrates the problem of classifying tweets, as this can be incredibly hard, even for a person to do.

b = "If you rename companies   $tsla = Earth Saver Inc #spacex = Earth Backup Inc  @elonmusk"
sid.polarity_scores(b)
{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

Here VADER says it is neutral, whereas this tweet demonstrates a clear bias towards a positive future ahead of Tesla. It is, however, impossible for an algorithm like VADER to take the context of the tweet into consideration (ie. earth saver being positive as we are destroying our planet), as it simply runs each word and sentence by it's lexicon.

c = "in no particular order"
sid.polarity_scores(c)
{'compound': -0.296, 'neg': 0.423, 'neu': 0.577, 'pos': 0.0}

The code above demonstrates one of the core problems with VADER: "No" makes this sentence negative.

We still choose to use VADER, as it performed the best in our initial test, where we compared it to TextBlob. Admittedly we did not try any manual labelling, nor did we train any algorithms ourselves. This might have improved our classification, although it would also have taken much more time. Further refinement could also have been done by use of Prodigy, although it comes at a cost of 390 dollars, and, to our knowledge, we could not access it without a license as students either.

def _label_compound(c):
    """Bucket a VADER compound score using the authors' recommended cutoffs.

    pos: c >= 0.05, neg: c <= -0.05, neu: in between. The original used
    strict > / < comparisons, which disagreed with the rule quoted above
    (">= 0.05" / "<= -0.05") exactly at the boundary values.
    """
    if c >= 0.05:
        return 'pos'
    if c <= -0.05:
        return 'neg'
    return 'neu'

# Full VADER score dict per tweet, its compound summary, and the final label.
df['scores'] = df['tweet'].apply(sid.polarity_scores)
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(_label_compound)

df.head()
created_at date username tweet replies_count retweets_count likes_count tweet_count scores compound comp_score
0 2020-12-18 10:05:50 UTC 2020-12-18 alpsoy66 The key question is, will there be liquidity s... 3.0 0.0 12.0 1 {'neg': 0.0, 'neu': 0.764, 'pos': 0.236, 'comp... 0.8689 pos
1 2020-12-18 09:38:06 UTC 2020-12-18 eliburton_ Tesla is more than a stock $tsla https://t.co... 0.0 0.0 15.0 1 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neu
2 2020-12-18 09:09:46 UTC 2020-12-18 tesla_tizzler 5/ As for $TSLA, active fund manager buying wi... 1.0 0.0 19.0 1 {'neg': 0.0, 'neu': 0.83, 'pos': 0.17, 'compou... 0.5994 pos
3 2020-12-18 09:08:04 UTC 2020-12-18 tesla_tizzler 4/ If this was the case, a group of HFs could ... 1.0 1.0 23.0 1 {'neg': 0.0, 'neu': 0.93, 'pos': 0.07, 'compou... 0.5106 pos
4 2020-12-18 09:07:32 UTC 2020-12-18 tesla_tizzler 3/ The large Special Situations funds e.g. Gol... 1.0 1.0 21.0 1 {'neg': 0.0, 'neu': 0.774, 'pos': 0.226, 'comp... 0.7845 pos
df.comp_score.value_counts()
pos    26146
neu    16048
neg    13908
Name: comp_score, dtype: int64

Checking our results

In order to see how Vader performs we want to print out the tweets that it classifies as either positive, negative or neutral categorically, so we can go through the first 200-300 tweets, and see if we find any misclassifications.

# Print the positively-labelled tweets among the first 16 rows.
# NOTE(review): `sortedDF['comp_score'][i]` is a LABEL lookup and sort_values
# keeps the original integer labels, so this walks rows in the original frame
# order (reverse-chronological), not the sorted order — confirm intended.
j = 1
sortedDF = df.sort_values(by=['comp_score'])
sortedDF.compound = sortedDF.compound.astype(str)
for i in range(min(16, sortedDF.shape[0])):
    if sortedDF['comp_score'][i] == 'pos':
        print(f"{j}) {sortedDF['compound'][i]}) {sortedDF['tweet'][i]}")
        print()
        j += 1
1) 0.8689) The key question is, will there be liquidity squeeze after  $tsla Q4 results once passive indexes pull out 130 mil and the rest of the world wants a piece after great results and 2021 prospects.. an interesting month awaiting.

2) 0.5994) 5/ As for $TSLA, active fund manager buying will remain unabated for some time, coupled with supportive strategic investors[Tencent, Baillie Gifford et al] and business fundamentals.

3) 0.5106) 4/ If this was the case, a group of HFs could buy up the entire free float of much smaller index inclusions than $TSLA,  and then sell to the indexers at any price. We don’t see that, because this is not what large HFs do.

4) 0.7845) 3/ The large Special Situations funds e.g. Goldman SSG or Davidson Kempner [$30bn each] deal in direct lending and credit arbitrage opportunities. And not index rebalancing trades such as $TSLA.

5) 0.6114) Happy $TSLA inclusion day!    https://t.co/bGKp5G53xL

6) 0.4939) What I care the most?   The production volume going forward  Soon, Tesla Giga Shanghai will surpass Fremont’s production volume.  And then $TSLA Giga Berlin will surpass Giga Shanghai’s production volume.  Next, #Tesla Giga Texas will surpass Giga Berlin’s volume.

7) 0.8271) German DER SPIEGEL magazine celebrates Elon Musk by giving Maye Musk space to wash the family history and promote Tesla's Chief Executive Swindler  https://t.co/Jq4gyhu54u  $TSLA $TSLAQ

8) 0.4779) When it comes to selling $TSLA shares right now, Tesla employees are under SEC blackout rules.   Makes for a lot less shares for sale tomorrow...  https://t.co/Wnll9narup

9) 0.6249) @kylaschwaberow @truth_tesla @pslucky13 @Tesla_Tizzler HFs bought and stockpiled $TSLA shares so they could sell them to Indexers tomorrow at a profit. If they paid $500/share and sell them for $650/share make $150/share for 1 month of effort.

10) 0.4696) Can you feel the calm? Are you nervous?😀 $TSLA

Examples of wrongly classified positive tweets:

0.7177) A Billion dollar company yet Customers, Families, People are Left to Fend for themselves w damages caused solely by ⁦@Tesla! Sorry but I don’t give an F about the stock 🤷🏻‍♀️⁩ TSLA TSLAQ #Tesla Tesla stock wrapping up best month since 2013

This message in the tweet is negative, but also contains a link to an article from CNBC called "Tesla stock wrapping up best month since 2013", which hugely affects the score VADER gives positively. The tweet might be ill-intended towards Tesla, but might also further spread the knowledge of Tesla performing well, so while the author intended it as negative, it might also be considered positive, as it directly links to a very positive article.

23717) 0.2684) The corona virus could be the Black Swan event which starts the deflation of the Tesla bubble. The borrowing covenants are very clear that if Tesla does not meet certain thresholds the plant is in essence repossessed by the CCP. With North America sales declining tsla tslaq

This is somehow positive and demonstrates the problems with the compound score and rule-based evaluation based on individual words (and to some extent their context). We can see the evaluation below:

c = 'The corona virus could be the Black Swan event which starts the deflation of the Tesla bubble. The borrowing covenants are very clear that if Tesla does not meet certain thresholds the plant is in essence repossessed by the CCP. With North America sales declining $tsla $tslaq'
sid.polarity_scores(c)
{'compound': 0.2684, 'neg': 0.036, 'neu': 0.905, 'pos': 0.058}

Mostly neutral, and more so negative which by VADER's compounding equates to a positive score of 0.2684. We evaluate this as a misclassification.

23745) 0.4479) By all means this is not a “nomal” short rate. Who is financing it? Who is giving the capital to hedgefunds to stay put.? Who has got such deep pockets? $tsla

This is a negative tweet, which indicates some hedgefunds have very big short positions, which at the time of this tweets writing, most likely due to the Tesla stock rising at a rapid pace, might have been very expensive in terms of losses. This indicates that VADER has a problem handling specific trading-terms like "short position".

Examples of wrongly classified negative tweets:

# Print the negatively-labelled tweets among the first 43 rows.
# NOTE(review): `[i]` is a label lookup on the pre-sort integer labels, so
# rows come out in original frame order, not sorted order — confirm intended.
j = 1
sortedDF = df.sort_values(by=['comp_score'])
sortedDF.compound = sortedDF.compound.astype(str)
for i in range(min(43, sortedDF.shape[0])):
    if sortedDF['comp_score'][i] == 'neg':
        print(f"{j}) {sortedDF['compound'][i]}) {sortedDF['tweet'][i]}")
        print()
        j += 1
1) -0.4404) Painful to read. I.d.3 is fast becoming a drama. $tsla

2) -0.4624) Hey @Gfilche @HyperChangeTV, just saw this video showing up in my recommended Youtube list! 🔥🔥🔥 Your audience certainly doesn't just care for $TSLA... 👙👙👙👀😂  https://t.co/TZBjEWyzfI

3) -0.7964) @JosephVVallace @28delayslater @SnoopDogg 💣 💥 A whole new level of $TSLA ⚡️ Short 🔥 Shorts 🩳 📸  https://t.co/r10XuLoGlc

4) -0.2732) $TSLA may go up OR down tomorrow. Don’t say you weren’t warned.  https://t.co/MfQrUUtiOZ

5) -0.3612) "#Tesla's S&amp;P 500 inclusion is reckless", $TSLA profit taking, possible crash on the horizon - @NewConstructs $TSLAQ  https://t.co/SHrslzP82t

6) -0.5362) So, everyone who bought Tesla stock over the last couple months - is going to sell it at the same time tomorrow?  What could go wrong?  $TSLA $TSLAQ

7) -0.7269) @truth_tesla @pslucky13 @Tesla_Tizzler 90% of the $tsla tweets on this topic are speculating about the possibility of a failed closing cross because of too much demand. There is an equal chance of a failed closing cross tomorrow because of too much supply.

8) -0.5256) @realDonaldTrump I am very disappointed in $TSLA stock price, and so is @elonmusk

9) -0.4926) I’m excited and scared for tomorrow! $TSLA 🚀😬

10) -0.4019) Hey, @DougDeMuro, you’re better than this. C’mon. Stop falling for the $tsla lies.

11420) -0.2755) Even the bulls aren't ready for 2020. TSLA #NotSellingAShareBefore10000

We regard this a positive. Comes down to the rule-based evaluation of VADER.

11494) -0.25) TSLA closes the week 3 cents shy of $800

This is simply a statement, and a positive at that - but knowing this would require the algorithm to have knowledge of the movement of the stock price (and know that increases are positive).

11498) -0.707) I think it's very unwise to sell Tesla. I mean, it's just very unwise. I think there is... there's a tsunami of hurt coming for those who sold their shares. It's going to be very, very unpleasant. I advise people to buy back while there's time. TSLA #NotSellingAShareBefore10000

Very negative, but this is towards the people who have sold the stock. It essentially says to keep the stock.

Examples of wrongly classified neutral tweets:

# Print the neutral-labelled tweets among the first 58 rows.
# NOTE(review): `[i]` is a label lookup on the pre-sort integer labels, so
# rows come out in original frame order, not sorted order — confirm intended.
j = 1
sortedDF = df.sort_values(by=['comp_score'])
sortedDF.compound = sortedDF.compound.astype(str)
for i in range(min(58, sortedDF.shape[0])):
    if sortedDF['comp_score'][i] == 'neu':
        print(f"{j}) {sortedDF['compound'][i]}) {sortedDF['tweet'][i]}")
        print()
        j += 1
1) 0.0) Tesla is more than a stock $tsla  https://t.co/y4yTkZ7rLs

2) 0.0) $TSLA has 18x’ed in the last 18 months!!👀

3) 0.0) Countdown to open $TSLA  https://t.co/87vr1US4m5

4) 0.0) MMs &amp; options brokers trying to manipulate $TSLA going into Friday:  https://t.co/13HTE7TcB4

5) 0.0) I predict that $TSLA dollar volume will set a global record tomorrow 🦊🦊🦊🦊🦊

6) -0.017) I would've strongly considered selling covered calls against my $TSLA with a high strike price, as any index-related surge is likely to be short lived. However, implied volatility simply hasn't enticing enough. Not worth it for me; just sitting this one out and holding.  https://t.co/uCdcOsTHHN

7) 0.0) Why Friday Is Big for Tesla Stock, Ahead of Monday’s Epic S  https://t.co/kuNRhONhDH $TSLA

8) 0.0) @MatchasmMatt @alandail @navtown @MrRobRobson The 13F $tsla holders only have to disclose at the end of each quarter.

9) 0.0) Tesla Norway is hoarding cars. Robo-taxi fleet coming in 13 days. $TSLA $TSLAQ  https://t.co/ibHImC6hWu

10) 0.0) $TSLA #TSLA This could be quite the move coming.    https://t.co/RZ5XRCBOX7

Most of the tweets in neutral are misclassified, as VADER stumbles on sentences that are highly context dependent or contain unknown words, weird spacing or use of symbols. As we run through the neutral tweets, we see that a higher proportion of the tweets are positive, rather than negative, when we do our subjective evaluation of the first 300 tweets. The proportion of tweets that are positive is around 170 out of 300, and 100 are negative. 30 tweets we had trouble classifying ourselves. This proportion of positive vs. negative does somewhat correspond to the difference among positive tweets that VADER classified as compared to negative (around 26000 positive vs 14000 negative). If this is the case, we can remove all these neutral tweets without changing the relative distribution of positive vs negative tweets. We do not want to keep the neutral tweets, as these tweets are clearly not neutral overall. The only way to handle these tweets properly would be to manually label them or improve VADER, which might be very difficult, as things like context-specific knowledge of where the stock price is at the moment would have to be added to VADER's source code.

df = df[df.comp_score != 'neu'].reset_index(drop=True)

Most used words

We tokenize our tweets to be able to create word clouds as well as topics later on.

p.set_options(p.OPT.URL, p.OPT.RESERVED, p.OPT.MENTION, p.OPT.NUMBER, p.OPT.HASHTAG, p.OPT.SMILEY, p.OPT.EMOJI)
def preprocess_tweet_text(tweet):
    """Normalize the Tesla ticker to the word 'tesla'.

    Case-insensitive, so mixed-case variants such as 'Tsla' are also
    caught (the original two substitutions only handled the exact
    strings 'TSLA' and 'tsla').
    """
    return re.sub('tsla', 'tesla', tweet, flags=re.IGNORECASE)
def _wanted_token(tok):
    # Keep purely alphabetic, non-stop-word tokens longer than 2 characters.
    return tok not in stop_words and tok.isalpha() and len(tok) > 2

def _tokenize(text):
    """Tokenize a cleaned tweet and lowercase the surviving tokens."""
    return [tok.lower() for tok in tknzr.tokenize(text) if _wanted_token(tok)]

df['clean_tweet'] = df['tweet'].map(p.clean)
df['new_clean'] = df['clean_tweet'].apply(preprocess_tweet_text)
tokens = df['new_clean'].map(_tokenize)
df['tokens'] = tokens

We require len(tok) above 2, as we do not want words like "I", "a", "it" etc., that do not add to our topics.

We prefer lemmatization over stemming, as lemmatization maps each token to a proper dictionary word instead of a possibly truncated stem, which keeps the word clouds and topics readable.

def word_lemmatizer(text):
  """Return the WordNet lemma of every token in *text* as a new list."""
  return [lemmatizer.lemmatize(token) for token in text]

# Lemmatize every token list; passing the function directly avoids the
# redundant `lambda x: word_lemmatizer(x)` wrapper.
df['lemmatized'] = df['tokens'].apply(word_lemmatizer)
print(df.lemmatized)
0        [the, key, question, liquidity, squeeze, tesla...
1        [tesla, active, fund, manager, buying, remain,...
2        [case, group, hf, buy, entire, free, float, sm...
3        [the, large, special, situation, fund, goldman...
4                           [happy, tesla, inclusion, day]
                               ...                        
40049    [fascinating, month, frivolity, twitter, mess,...
40050    [convinced, army, noobs, soon, learn, meaning,...
40051    [review, bought, year, tesla, started, channel...
40052    [everyone, heard, accountant, joke, answer, qu...
40053    [boom, the, nhtsa, investigate, fatal, dec, te...
Name: lemmatized, Length: 40054, dtype: object

Most frequently used words (lemmatized):

# Flatten all lemmatized token lists and count word frequencies.
counter = itertools.chain.from_iterable(df['lemmatized'])
counted_tags = Counter(counter)
# Top 50 most frequent lemmas.
counted_tags.most_common(50)
[('tesla', 51004),
 ('teslaq', 6924),
 ('day', 4081),
 ('stock', 4006),
 ('share', 3962),
 ('the', 3796),
 ('like', 3080),
 ('year', 2967),
 ('price', 2869),
 ('car', 2793),
 ('today', 2760),
 ('short', 2737),
 ('market', 2671),
 ('model', 2649),
 ('elon', 2604),
 ('time', 2438),
 ('this', 2322),
 ('new', 2085),
 ('amp', 2013),
 ('company', 2010),
 ('week', 1951),
 ('buy', 1877),
 ('musk', 1830),
 ('going', 1827),
 ('good', 1587),
 ('battery', 1564),
 ('think', 1541),
 ('know', 1425),
 ('people', 1360),
 ('look', 1346),
 ('sell', 1342),
 ('long', 1290),
 ('china', 1239),
 ('right', 1225),
 ('delivery', 1196),
 ('investor', 1194),
 ('month', 1192),
 ('sale', 1187),
 ('need', 1155),
 ('want', 1117),
 ('profit', 1112),
 ('and', 1111),
 ('high', 1080),
 ('money', 1070),
 ('dont', 1033),
 ('way', 1007),
 ('you', 993),
 ('thing', 962),
 ('great', 956),
 ('what', 944)]

Prominent words are tesla (changed from TSLA) and teslaq (changed from TSLAQ), which refers to a group of people that hate Tesla, as well as short, which indicates shorting the stock.

50 most frequent words, positive tweets

# Word cloud over the tweets VADER classified as positive.
df_positive = df[df['comp_score'] == 'pos']
text = df_positive["lemmatized"].to_string()

wordcloud = WordCloud(max_words=50,
                      width=800,
                      height=500,
                      background_color='white',
                      min_font_size=10,
                      collocations=False).generate(text)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

50 most frequent words, negative tweets

# Word cloud over the tweets VADER classified as negative.
df_negative = df[df['comp_score'] == 'neg']

text = df_negative["lemmatized"].to_string()
wordcloud = WordCloud(max_words = 50, width = 800, height = 500, 
                background_color ='white', 
                min_font_size = 10, collocations=False).generate(text)

plt.figure(figsize = (8, 8), facecolor = None)
# interpolation='bilinear' added for consistency with the positive word cloud.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad = 0) 
plt.show()

Topic modelling

In order to see how the sentiment classifier VADER performs at a higher level of abstraction, topic modelling can be used. This provides an easier way of seeing how different topics relate and which topics are classified as negative as compared to positive. We use TF-IDF as a representation of words. We tried a bag-of-words representation, but this did not create as interesting topics. This is likely because TF-IDF rewards words that appear often within an individual tweet (locally) but rarely across all negative or positive tweets (globally), whereas BOW simply gives a high weight to frequently appearing words. When a word has a high term frequency (tf) in a given tweet (local parameter) and a low frequency in the whole collection of tweets (global parameter), it gets a higher score, and we are thereby able to see more distinct words — other than "tesla", "that" etc. — that are present in smaller clusters of tweets and might relate to discussions surrounding specific topics.

def _to_tfidf(docs):
    """Build a filtered Dictionary, its BOW corpus and the TF-IDF-weighted corpus."""
    vocab = Dictionary(docs)
    # Keep terms appearing in >=5 docs and <=50% of docs, capped at 1000 terms.
    vocab.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
    corpus = [vocab.doc2bow(doc) for doc in docs]
    weights = TfidfModel(corpus)
    return vocab, corpus, weights[corpus]

dictionary_pos, corpus_pos, tfidf_pos = _to_tfidf(df_positive['lemmatized'])
dictionary_neg, corpus_neg, tfidf_neg = _to_tfidf(df_negative['lemmatized'])

Positive topics

from gensim.models import CoherenceModel

# Coherence score per candidate topic count (a list, appended to per model).
coherence = []
# Candidate numbers of topics to evaluate.
I = range(1, 21)
# Fit one LDA model per topic count and record its c_v coherence.
for i in I:
    lda = LdaMulticore(tfidf_pos, id2word=dictionary_pos, num_topics=i)
    coherence_model_lda = CoherenceModel(model=lda, texts=df_positive['lemmatized'].tolist(), dictionary=dictionary_pos, coherence='c_v')
    coherence.append(coherence_model_lda.get_coherence())
# 10 is the number that gives a high coherence while not being overwhelmingly high.
# Keyword arguments: positional x/y for sns.lineplot is deprecated.
sns.lineplot(x=list(I), y=coherence)
<matplotlib.axes._subplots.AxesSubplot at 0x7ffa2334f208>
# Final positive-tweet topic model with the topic count chosen above.
lda_model_pos = LdaMulticore(
    tfidf_pos,
    id2word=dictionary_pos,
    num_topics=10,
    workers=4,
    passes=1,
)
lda_model_pos.print_topics(-1)
[(0,
  '0.019*"share" + 0.012*"what" + 0.011*"teslaq" + 0.009*"stock" + 0.008*"price" + 0.007*"day" + 0.007*"buy" + 0.006*"earnings" + 0.006*"today" + 0.006*"amp"'),
 (1,
  '0.013*"teslaq" + 0.011*"market" + 0.010*"the" + 0.009*"car" + 0.008*"month" + 0.007*"year" + 0.007*"share" + 0.007*"day" + 0.007*"model" + 0.007*"stock"'),
 (2,
  '0.014*"good" + 0.010*"teslaq" + 0.010*"new" + 0.008*"share" + 0.008*"today" + 0.008*"lol" + 0.008*"morning" + 0.007*"day" + 0.007*"week" + 0.007*"tomorrow"'),
 (3,
  '0.010*"teslaq" + 0.008*"stock" + 0.008*"model" + 0.007*"ready" + 0.007*"year" + 0.007*"short" + 0.007*"market" + 0.006*"shorting" + 0.006*"like" + 0.006*"price"'),
 (4,
  '0.012*"secured" + 0.011*"short" + 0.009*"chart" + 0.009*"teslaq" + 0.008*"share" + 0.008*"profit" + 0.007*"car" + 0.007*"like" + 0.007*"the" + 0.007*"going"'),
 (5,
  '0.011*"today" + 0.011*"like" + 0.009*"teslaq" + 0.008*"elon" + 0.008*"share" + 0.007*"the" + 0.007*"love" + 0.007*"thank" + 0.007*"musk" + 0.006*"year"'),
 (6,
  '0.014*"short" + 0.009*"teslaq" + 0.008*"share" + 0.008*"stock" + 0.008*"interest" + 0.007*"day" + 0.006*"the" + 0.006*"elon" + 0.006*"price" + 0.006*"good"'),
 (7,
  '0.013*"day" + 0.012*"wow" + 0.010*"great" + 0.009*"teslaq" + 0.009*"today" + 0.008*"like" + 0.008*"share" + 0.008*"stock" + 0.007*"good" + 0.007*"nice"'),
 (8,
  '0.014*"look" + 0.011*"like" + 0.010*"china" + 0.008*"teslaq" + 0.007*"share" + 0.007*"model" + 0.007*"price" + 0.007*"day" + 0.007*"year" + 0.006*"interesting"'),
 (9,
  '0.011*"share" + 0.011*"added" + 0.011*"like" + 0.010*"year" + 0.010*"teslaq" + 0.008*"day" + 0.007*"short" + 0.007*"stock" + 0.007*"sold" + 0.007*"time"')]
# Interactive topic visualization for the positive-tweet model.
lda_display = pyLDAvis.gensim.prepare(
    lda_model_pos, corpus_pos, dictionary_pos
)
pyLDAvis.display(lda_display)

Overall the key tokens seem to be mostly positive. We do see words like "short" and "teslaq" — the latter being a group of Tesla haters. This might be a topic covering the discussions between investors that are bullish on Tesla and investors that are bearish on Tesla. The same goes for some of the other topics, which might be positive investors using the cashtag "$tslaq" to tell the bears that they are wrong.

Negative topics

# Coherence score per candidate topic count (a list, appended to per model).
coherence = []
# Candidate numbers of topics to evaluate.
I = range(1, 21)
# Fit one LDA model per topic count and record its c_v coherence.
for i in I:
    lda = LdaMulticore(tfidf_neg, id2word=dictionary_neg, num_topics=i)
    coherence_model_lda = CoherenceModel(model=lda, texts=df_negative['lemmatized'].tolist(), dictionary=dictionary_neg, coherence='c_v')
    coherence.append(coherence_model_lda.get_coherence())
# 8 is the number that gives a high coherence while not being overwhelmingly high.
# NOTE(review): the model below is built with num_topics=11, not 8 — confirm which was intended.
# Keyword arguments: positional x/y for sns.lineplot is deprecated.
sns.lineplot(x=list(I), y=coherence)
<matplotlib.axes._subplots.AxesSubplot at 0x7ffa208e9ef0>
# Final negative-tweet topic model.
lda_model_neg = LdaMulticore(
    tfidf_neg,
    id2word=dictionary_neg,
    num_topics=11,
    workers=4,
    passes=1,
)
lda_model_neg.print_topics(-1)
[(0,
  '0.013*"teslaq" + 0.009*"short" + 0.008*"elon" + 0.008*"day" + 0.007*"musk" + 0.007*"demand" + 0.007*"today" + 0.006*"people" + 0.006*"the" + 0.006*"stock"'),
 (1,
  '0.013*"teslaq" + 0.012*"short" + 0.008*"year" + 0.008*"price" + 0.008*"fraud" + 0.007*"car" + 0.007*"the" + 0.007*"model" + 0.007*"amp" + 0.006*"day"'),
 (2,
  '0.011*"teslaq" + 0.010*"elon" + 0.007*"short" + 0.007*"the" + 0.006*"stock" + 0.006*"market" + 0.006*"musk" + 0.006*"what" + 0.006*"day" + 0.005*"share"'),
 (3,
  '0.011*"day" + 0.008*"teslaq" + 0.008*"like" + 0.008*"stock" + 0.007*"price" + 0.007*"model" + 0.006*"wrong" + 0.006*"market" + 0.006*"fraud" + 0.006*"elon"'),
 (4,
  '0.010*"teslaq" + 0.009*"stock" + 0.008*"the" + 0.007*"musk" + 0.007*"price" + 0.006*"market" + 0.006*"day" + 0.006*"elon" + 0.006*"time" + 0.006*"this"'),
 (5,
  '0.014*"teslaq" + 0.010*"stock" + 0.008*"this" + 0.008*"today" + 0.007*"stop" + 0.007*"new" + 0.006*"the" + 0.006*"crash" + 0.006*"know" + 0.006*"day"'),
 (6,
  '0.010*"teslaq" + 0.009*"robotaxis" + 0.009*"the" + 0.008*"model" + 0.007*"elon" + 0.007*"road" + 0.007*"year" + 0.007*"company" + 0.007*"car" + 0.006*"time"'),
 (7,
  '0.010*"teslaq" + 0.008*"the" + 0.008*"day" + 0.007*"car" + 0.007*"stock" + 0.006*"model" + 0.006*"price" + 0.006*"you" + 0.005*"right" + 0.005*"know"'),
 (8,
  '0.008*"teslaq" + 0.008*"elon" + 0.006*"short" + 0.006*"day" + 0.006*"car" + 0.006*"musk" + 0.006*"stock" + 0.006*"model" + 0.005*"the" + 0.005*"delivery"'),
 (9,
  '0.012*"teslaq" + 0.009*"day" + 0.008*"the" + 0.008*"short" + 0.008*"lost" + 0.007*"time" + 0.006*"market" + 0.006*"battery" + 0.006*"just" + 0.005*"today"'),
 (10,
  '0.010*"going" + 0.009*"teslaq" + 0.007*"today" + 0.006*"share" + 0.006*"year" + 0.006*"stop" + 0.006*"stock" + 0.006*"car" + 0.005*"this" + 0.005*"the"')]
# Interactive topic visualization for the negative-tweet model.
lda_display = pyLDAvis.gensim.prepare(
    lda_model_neg, corpus_neg, dictionary_neg
)
pyLDAvis.display(lda_display)

We see more negative words like "fraud", "shorted", "lose", and an overall more negative vibe. This tells us that our sentiment classifier VADER has at least had some success classifying the tweets, since it managed to create different key tokens across the positive and negative topics.

Adding timedelta to tweets and grouping by day

Because NASDAQ closes at 21:00 UTC, tweets published after this time will likely only be able to influence the stock price the next day. In order to compensate for this, we add a timedelta of 3 hours to the tweets, so that the date changes exactly when NASDAQ closes. This is a very simple approach and assumes all trading days have the usual opening hours, but it does solve most of the issue of matching tweets to days.

df['created_at'] = pd.to_datetime(df.created_at)
# Per-tweet counters so the groupby sum yields positive/negative tweet counts
# (negatives are stored as -1 so the two columns can be summed directly).
df.loc[df['comp_score'] == 'pos', 'tweet_count_positive'] = 1
df.loc[df['comp_score'] == 'neg', 'tweet_count_negative'] = -1
# Aggregate into 30-minute buckets.
df_grouped_h = df.groupby(pd.Grouper(key='created_at', freq='30min')).sum()
# Shift the timestamp by 3 hours so the date flips at NASDAQ's 21:00 UTC close.
df_grouped_h['date'] = df_grouped_h.index + timedelta(hours=3)
df_grouped_h.iloc[16870:16900]
replies_count retweets_count likes_count tweet_count compound tweet_count_positive tweet_count_negative date
created_at
2020-12-17 11:30:00+00:00 17.0 12.0 160.0 3 1.4616 2.0 -1.0 2020-12-17 14:30:00+00:00
2020-12-17 12:00:00+00:00 16.0 2.0 146.0 4 -0.6964 2.0 -2.0 2020-12-17 15:00:00+00:00
2020-12-17 12:30:00+00:00 10.0 23.0 154.0 3 0.7911 2.0 -1.0 2020-12-17 15:30:00+00:00
2020-12-17 13:00:00+00:00 23.0 22.0 325.0 5 2.4385 5.0 0.0 2020-12-17 16:00:00+00:00
2020-12-17 13:30:00+00:00 10.0 17.0 344.0 5 1.2881 3.0 -2.0 2020-12-17 16:30:00+00:00
2020-12-17 14:00:00+00:00 12.0 3.0 135.0 4 -0.1862 2.0 -2.0 2020-12-17 17:00:00+00:00
2020-12-17 14:30:00+00:00 48.0 23.0 572.0 6 2.9317 5.0 -1.0 2020-12-17 17:30:00+00:00
2020-12-17 15:00:00+00:00 92.0 34.0 800.0 11 1.9181 8.0 -3.0 2020-12-17 18:00:00+00:00
2020-12-17 15:30:00+00:00 58.0 15.0 443.0 8 2.9841 6.0 -2.0 2020-12-17 18:30:00+00:00
2020-12-17 16:00:00+00:00 84.0 33.0 346.0 5 0.0813 3.0 -2.0 2020-12-17 19:00:00+00:00
2020-12-17 16:30:00+00:00 91.0 5.0 535.0 6 1.4239 5.0 -1.0 2020-12-17 19:30:00+00:00
2020-12-17 17:00:00+00:00 289.0 250.0 2820.0 16 2.1175 12.0 -4.0 2020-12-17 20:00:00+00:00
2020-12-17 17:30:00+00:00 62.0 24.0 653.0 6 0.7088 3.0 -3.0 2020-12-17 20:30:00+00:00
2020-12-17 18:00:00+00:00 120.0 94.0 1534.0 18 2.6515 12.0 -6.0 2020-12-17 21:00:00+00:00
2020-12-17 18:30:00+00:00 60.0 12.0 164.0 7 -0.0969 3.0 -4.0 2020-12-17 21:30:00+00:00
2020-12-17 19:00:00+00:00 25.0 31.0 318.0 8 3.5111 7.0 -1.0 2020-12-17 22:00:00+00:00
2020-12-17 19:30:00+00:00 113.0 301.0 2928.0 15 8.0697 15.0 0.0 2020-12-17 22:30:00+00:00
2020-12-17 20:00:00+00:00 67.0 36.0 749.0 10 -1.3144 4.0 -6.0 2020-12-17 23:00:00+00:00
2020-12-17 20:30:00+00:00 170.0 157.0 2921.0 27 9.9324 21.0 -6.0 2020-12-17 23:30:00+00:00
2020-12-17 21:00:00+00:00 96.0 101.0 2148.0 17 3.4683 11.0 -6.0 2020-12-18 00:00:00+00:00
2020-12-17 21:30:00+00:00 21.0 11.0 222.0 5 1.3287 4.0 -1.0 2020-12-18 00:30:00+00:00
2020-12-17 22:00:00+00:00 140.0 15.0 794.0 10 -1.3845 4.0 -6.0 2020-12-18 01:00:00+00:00
2020-12-17 22:30:00+00:00 119.0 47.0 777.0 12 2.1478 8.0 -4.0 2020-12-18 01:30:00+00:00
2020-12-17 23:00:00+00:00 67.0 21.0 285.0 5 0.1430 3.0 -2.0 2020-12-18 02:00:00+00:00
2020-12-17 23:30:00+00:00 72.0 18.0 280.0 2 1.2966 2.0 0.0 2020-12-18 02:30:00+00:00
2020-12-18 00:00:00+00:00 33.0 4.0 73.0 3 -0.1686 2.0 -1.0 2020-12-18 03:00:00+00:00
2020-12-18 00:30:00+00:00 1.0 2.0 27.0 2 0.8200 2.0 0.0 2020-12-18 03:30:00+00:00
2020-12-18 01:00:00+00:00 71.0 42.0 735.0 5 3.1012 5.0 0.0 2020-12-18 04:00:00+00:00
2020-12-18 01:30:00+00:00 97.0 81.0 1469.0 3 1.2224 2.0 -1.0 2020-12-18 04:30:00+00:00
2020-12-18 02:00:00+00:00 2.0 2.0 30.0 1 0.6249 1.0 0.0 2020-12-18 05:00:00+00:00
# Roll the shifted 30-minute buckets up into one row per (trading-adjusted) day.
df_day = (
    df_grouped_h
    .groupby(pd.Grouper(key='date', freq='1d'))
    .sum()
)
df_day
replies_count retweets_count likes_count tweet_count compound tweet_count_positive tweet_count_negative
date
2020-01-01 00:00:00+00:00 414.0 653.0 5003.0 76 2.9702 42.0 -34.0
2020-01-02 00:00:00+00:00 454.0 821.0 6273.0 108 14.8880 68.0 -40.0
2020-01-03 00:00:00+00:00 1180.0 2209.0 18289.0 173 36.6589 118.0 -55.0
2020-01-04 00:00:00+00:00 325.0 571.0 3695.0 65 14.3277 42.0 -23.0
2020-01-05 00:00:00+00:00 386.0 959.0 6996.0 63 10.4158 40.0 -23.0
... ... ... ... ... ... ... ...
2020-12-14 00:00:00+00:00 1639.0 766.0 19232.0 182 45.8080 128.0 -54.0
2020-12-15 00:00:00+00:00 1853.0 1016.0 23874.0 221 44.8529 148.0 -73.0
2020-12-16 00:00:00+00:00 2013.0 1031.0 22205.0 192 37.7729 125.0 -67.0
2020-12-17 00:00:00+00:00 1773.0 1383.0 22098.0 233 62.5091 169.0 -64.0
2020-12-18 00:00:00+00:00 925.0 428.0 9020.0 98 21.8282 67.0 -31.0

353 rows × 7 columns

Merging stock data and tweet data

As we need a combined dataframe for backtesting purposes, as well as for running a neural net, the dataframe tesla needs to be merged into df_day. Because Twitter is open 24/7 and NASDAQ has specific opening hours, we also need to handle the days on which we are missing stock data, such as Saturdays and Sundays.

tesla
Open High Low Close Volume Dividends Stock Splits
Date
2020-01-02 84.90 86.14 84.34 86.05 47660500 0 0.0
2020-01-03 88.10 90.80 87.38 88.60 88892500 0 0.0
2020-01-06 88.09 90.31 88.00 90.31 50665000 0 0.0
2020-01-07 92.28 94.33 90.67 93.81 89410500 0 0.0
2020-01-08 94.74 99.70 93.65 98.43 155721500 0 0.0
... ... ... ... ... ... ... ...
2020-12-14 619.00 642.75 610.20 639.83 52040600 0 0.0
2020-12-15 643.28 646.90 623.80 633.25 45223600 0 0.0
2020-12-16 628.23 632.50 605.00 622.77 42095800 0 0.0
2020-12-17 628.19 658.82 619.50 655.90 56270100 0 0.0
2020-12-18 668.90 695.00 628.54 695.00 222126200 0 0.0

245 rows × 7 columns

# Move df_day onto a tz-naive DatetimeIndex so it aligns with the yfinance index.
df_day['date'] = df_day.index
df_day['date'] = df_day['date'].astype('datetime64[ns]')
df_day = df_day.set_index('date')
# Outer join keeps the weekend days (which have tweets but no trading data).
df_combined = df_day.merge(tesla, how='outer', left_index=True, right_index=True)
# Forward-fill stock columns across non-trading days.
# fillna(method='ffill') is deprecated in modern pandas; .ffill() is equivalent.
df_combined = df_combined.ffill()
df_combined.dropna(inplace=True)
df_combined.rename(columns={'Close': 'close'}, inplace=True)
# weekday == 1.0 flags Saturday/Sunday (dayofweek 5 or 6); 0.0 on weekdays.
df_combined['weekday'] = ((pd.DatetimeIndex(df_combined.index).dayofweek) // 5 == 1).astype(float)
df_combined
replies_count retweets_count likes_count tweet_count compound tweet_count_positive tweet_count_negative Open High Low close Volume Dividends Stock Splits weekday
2020-01-02 454.0 821.0 6273.0 108 14.8880 68.0 -40.0 84.90 86.14 84.34 86.05 47660500.0 0.0 0.0 0.0
2020-01-03 1180.0 2209.0 18289.0 173 36.6589 118.0 -55.0 88.10 90.80 87.38 88.60 88892500.0 0.0 0.0 0.0
2020-01-04 325.0 571.0 3695.0 65 14.3277 42.0 -23.0 88.10 90.80 87.38 88.60 88892500.0 0.0 0.0 1.0
2020-01-05 386.0 959.0 6996.0 63 10.4158 40.0 -23.0 88.10 90.80 87.38 88.60 88892500.0 0.0 0.0 1.0
2020-01-06 390.0 665.0 5384.0 81 10.9749 47.0 -34.0 88.09 90.31 88.00 90.31 50665000.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2020-12-14 1639.0 766.0 19232.0 182 45.8080 128.0 -54.0 619.00 642.75 610.20 639.83 52040600.0 0.0 0.0 0.0
2020-12-15 1853.0 1016.0 23874.0 221 44.8529 148.0 -73.0 643.28 646.90 623.80 633.25 45223600.0 0.0 0.0 0.0
2020-12-16 2013.0 1031.0 22205.0 192 37.7729 125.0 -67.0 628.23 632.50 605.00 622.77 42095800.0 0.0 0.0 0.0
2020-12-17 1773.0 1383.0 22098.0 233 62.5091 169.0 -64.0 628.19 658.82 619.50 655.90 56270100.0 0.0 0.0 0.0
2020-12-18 925.0 428.0 9020.0 98 21.8282 67.0 -31.0 668.90 695.00 628.54 695.00 222126200.0 0.0 0.0 0.0

352 rows × 15 columns

# Blank out the (forward-filled) stock columns on weekends so they can be
# re-filled with interpolated values below.
weekend_rows = df_combined['weekday'] == 1
df_combined.loc[weekend_rows, ['close', 'Volume']] = np.nan
def curve_function(df):
  """Fill NaN runs in every column with successive midpoints, producing a
  concave interpolation between the last value before a gap and the first
  value after it.

  Each pass over a column fills the middle element of every remaining NaN
  run with the mean of the values bounding that run; passes repeat until no
  NaNs are left. Requires a default integer (Range) index. Mutates and
  returns *df*.

  The original implementation had an if/else on the parity of the run
  length whose two branches were byte-identical, plus a no-op
  ``df.loc[j,i] = df.loc[j,i]`` else-branch; both are collapsed here
  without changing behavior.
  """
  for col in df.columns:
      while df[col].isna().sum() > 0:
          for row in range(df.shape[0]):
              if pd.isnull(df.loc[row, col]):
                  # Collect the NaN run plus the first non-null index after it.
                  run = [row]
                  k = row
                  while pd.isnull(df.loc[k, col]):
                      k = k + 1
                      run.append(k)
                  # Fill the middle of the run with the mean of the value just
                  # before the run and the first value after it.
                  df.loc[run[(len(run) - 1) // 2], col] = (
                      df.loc[row - 1, col] + df.loc[run[-1], col]
                  ) / 2
  return(df)
# 1 = trading day (Volume present), 0 = market closed.
# NOTE(review): the original built `weekday` twice — first from 'close',
# then overwriting it from 'Volume'. Only the 'Volume' pass survived, and
# the two agree anyway since both columns were NaN-ed on the same rows above,
# so the dead first loop is dropped here.
weekday = [0 if pd.isna(v) else 1 for v in df_combined['Volume']]
df_combined['date'] = df_combined.index

df_combined.reset_index(inplace=True)
# Fill the weekend NaNs in 'close'/'Volume' with the concave midpoint fill.
df_net_curve = df_combined.pipe(curve_function)

df_net_curve.head()
index replies_count retweets_count likes_count tweet_count compound tweet_count_positive tweet_count_negative Open High Low close Volume Dividends Stock Splits weekday date
0 2020-01-02 454.0 821.0 6273.0 108 14.8880 68.0 -40.0 84.90 86.14 84.34 86.0500 47660500.0 0.0 0.0 0.0 2020-01-02
1 2020-01-03 1180.0 2209.0 18289.0 173 36.6589 118.0 -55.0 88.10 90.80 87.38 88.6000 88892500.0 0.0 0.0 0.0 2020-01-03
2 2020-01-04 325.0 571.0 3695.0 65 14.3277 42.0 -23.0 88.10 90.80 87.38 89.0275 79335625.0 0.0 0.0 1.0 2020-01-04
3 2020-01-05 386.0 959.0 6996.0 63 10.4158 40.0 -23.0 88.10 90.80 87.38 89.4550 69778750.0 0.0 0.0 1.0 2020-01-05
4 2020-01-06 390.0 665.0 5384.0 81 10.9749 47.0 -34.0 88.09 90.31 88.00 90.3100 50665000.0 0.0 0.0 0.0 2020-01-06
# Attach the trading-day flag (computed before the NaN fill) as a new column.
weekday_flags = pd.Series(weekday, name='weekday')
df_combined = pd.concat([df_net_curve, weekday_flags], axis=1)
df_combined.head()
index replies_count retweets_count likes_count tweet_count compound tweet_count_positive tweet_count_negative Open High Low close Volume Dividends Stock Splits weekday date weekday
0 2020-01-02 454.0 821.0 6273.0 108 14.8880 68.0 -40.0 84.90 86.14 84.34 86.0500 47660500.0 0.0 0.0 0.0 2020-01-02 1
1 2020-01-03 1180.0 2209.0 18289.0 173 36.6589 118.0 -55.0 88.10 90.80 87.38 88.6000 88892500.0 0.0 0.0 0.0 2020-01-03 1
2 2020-01-04 325.0 571.0 3695.0 65 14.3277 42.0 -23.0 88.10 90.80 87.38 89.0275 79335625.0 0.0 0.0 1.0 2020-01-04 0
3 2020-01-05 386.0 959.0 6996.0 63 10.4158 40.0 -23.0 88.10 90.80 87.38 89.4550 69778750.0 0.0 0.0 1.0 2020-01-05 0
4 2020-01-06 390.0 665.0 5384.0 81 10.9749 47.0 -34.0 88.09 90.31 88.00 90.3100 50665000.0 0.0 0.0 0.0 2020-01-06 1

We have now implemented a function that looks at a given missing day x, takes the last known value before it and the next value y on which NASDAQ was open, and averages them: (x+y)/2. Over a weekend this combines Friday's stock price with Monday's, so both Saturday and Sunday are filled with values derived this way.

Tweet sentiment vs. stock price

With the dataframes merged, tweets matched in time and handling of days where NASDAQ is closed, we can now begin to look into how tweet sentiments and tweets relate:

# Daily tweet counts (positive, negative, net) against the closing price.
# The original bound four identical copies of df_combined.date (x7/x8/x9/x_t)
# and created an unused random series (idx/s); both removed as dead code.
plot_dates = df_combined.date
positive_counts = df_combined['tweet_count_positive']
negative_counts = -(df_combined['tweet_count_negative'])
net_counts = df_combined.tweet_count_positive + df_combined.tweet_count_negative
price = df_combined.close

fig, axs = plt.subplots(3, figsize=(14, 8))
axs[0].plot(plot_dates, positive_counts)
axs[0].plot(plot_dates, price)
axs[1].plot(plot_dates, negative_counts)
axs[1].plot(plot_dates, price)
axs[2].plot(plot_dates, net_counts)
axs[2].plot(plot_dates, price)
axs[0].set_title('Aktiepris (grøn) og antal af positive tweets (blå)')
axs[1].set_title('Aktiepris (grøn) og antal af negative tweets (blå)')
axs[2].set_title('Positive tweets minus negative')
plt.tight_layout()

This plot indicates clear spikes in stock price when many positive tweets occur. There are also minor falls in stock price when the negative tweets are numerous and outweigh, or almost outweigh, the positive tweets.

Next we can look into a smaller period of time, namely a thirty day period

# Same three panels, zoomed in on a 30-day window (rows 303–332).
# The unused random series (idx/s) from the original is removed as dead code.
df_30 = df_combined.iloc[303:333]
window_dates = df_30.date
positive_30 = df_30.tweet_count_positive
negative_30 = -df_30.tweet_count_negative
price_30 = df_30.close
net_30 = df_30.tweet_count_positive + df_30.tweet_count_negative

fig, axs = plt.subplots(3, figsize=(14, 8))
axs[0].plot(window_dates, positive_30)
axs[0].plot(window_dates, price_30)
axs[1].plot(window_dates, negative_30)
axs[1].plot(window_dates, price_30)
axs[2].plot(window_dates, net_30)
axs[2].plot(window_dates, price_30)
axs[0].set_title('Aktiepris (grøn) og antal af positive tweets (blå)')
axs[1].set_title('Aktiepris (grøn) og antal af negative tweets (blå)')
axs[2].set_title('Positive minus negative tweets')
plt.tight_layout()

Backtesting

We implement three trading strategies that we backtest.

1. Buy/hold strategy

This strategy is a simple buy/hold strategy where we buy on the 2nd of january 2020 and hold it until the last date of our dataframe which is 18th of december 2020.

We have 100.000 USD to invest and we will now calculate the return.

# Benchmark: buy on the first trading day and hold until the end of the data.
tesla.rename(columns={'Close': 'close'}, inplace=True)
# .copy() avoids SettingWithCopy issues when adding columns to the slice.
buy_hold = tesla[['close']].copy()
cash = 100000
# .iloc[0] / .iloc[-1]: positional integer indexing on a DatetimeIndex via []
# is deprecated; use explicit positional access.
buy_price = buy_hold['close'].iloc[0]
# Whole shares affordable at the entry price.
buy_hold['holdings'] = round(cash / buy_price)
buy_hold['value'] = buy_hold['close'] * buy_hold['holdings']
# Cumulative growth factor of the position value.
buy_hold['return'] = (buy_hold['value'].pct_change()[1:] + 1).cumprod()
buy_hold_value = buy_hold['value'].iloc[-1]
print('Our buy/hold strategy would give us a return of: ' + str(round(buy_hold['return'].iloc[-1] * 100)) + '%')
Our buy/hold strategy would give us a return of: 808.0%

2. Simple moving average strategy

First, a basic strategy based on different moving averages.

# Four simple-moving-average-crossover backtests with different windows.
tsla = tesla[['close']]
sma_windows = [(10, 60), (15, 75), (10, 30), (5, 15)]
strat_1, strat_2, strat_3, strat_4 = (
    backtest('smac', tsla, fast_period=fast, slow_period=slow, commission=0)
    for fast, slow in sma_windows
)
Starting Portfolio Value: 100000.00
2020-12-18, ===Global level arguments===
2020-12-18, init_cash : 100000
2020-12-18, buy_prop : 1
2020-12-18, sell_prop : 1
2020-12-18, commission : 0
2020-12-18, stop_loss : 0
2020-12-18, stop_trail : 0
===Strategy level arguments===
fast_period : 10
slow_period : 60
2020-12-18, Final Portfolio Value: 418444.93999999994
2020-12-18, Final PnL: 318444.94
Time used (seconds): 0.06559491157531738
==================================================
Number of strat runs: 1
Number of strats per run: 1
Strat names: ['smac']
**************************************************
--------------------------------------------------
Strategy Parameters	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:10	slow_period:60
Returns	rtot:1.431375130172489	ravg:0.005842347470091792	rnorm:3.3591259298717717	rnorm100:335.91259298717716
Sharpe	sharperatio:None
Drawdown	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 59), ('drawdown', 33.721354502031325), ('moneydown', 112465.59000000003)])
Timedraw	maxdrawdown:33.721354502031325	maxdrawdownperiod:59
Optimal parameters:	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:10	slow_period:60
Optimal metrics:	rtot:1.431375130172489	ravg:0.005842347470091792	rnorm:3.3591259298717717	rnorm100:335.91259298717716	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 59), ('drawdown', 33.721354502031325), ('moneydown', 112465.59000000003)])	maxdrawdown:33.721354502031325	maxdrawdownperiod:59	sharperatio:None	pnl:318444.94	final_value:418444.93999999994
Starting Portfolio Value: 100000.00
2020-12-18, ===Global level arguments===
2020-12-18, init_cash : 100000
2020-12-18, buy_prop : 1
2020-12-18, sell_prop : 1
2020-12-18, commission : 0
2020-12-18, stop_loss : 0
2020-12-18, stop_trail : 0
===Strategy level arguments===
fast_period : 15
slow_period : 75
2020-12-18, Final Portfolio Value: 491586.08999999997
2020-12-18, Final PnL: 391586.09
Time used (seconds): 0.08666706085205078
==================================================
Number of strat runs: 1
Number of strats per run: 1
Strat names: ['smac']
**************************************************
--------------------------------------------------
Strategy Parameters	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:15	slow_period:75
Returns	rtot:1.592466895941384	ravg:0.006499864881393404	rnorm:4.1446942974276535	rnorm100:414.4694297427653
Sharpe	sharperatio:None
Drawdown	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 56), ('drawdown', 33.71419377566371), ('moneydown', 118853.76999999999)])
Timedraw	maxdrawdown:33.71419377566371	maxdrawdownperiod:56
Optimal parameters:	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:15	slow_period:75
Optimal metrics:	rtot:1.592466895941384	ravg:0.006499864881393404	rnorm:4.1446942974276535	rnorm100:414.4694297427653	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 56), ('drawdown', 33.71419377566371), ('moneydown', 118853.76999999999)])	maxdrawdown:33.71419377566371	maxdrawdownperiod:56	sharperatio:None	pnl:391586.09	final_value:491586.08999999997
Starting Portfolio Value: 100000.00
2020-12-18, ===Global level arguments===
2020-12-18, init_cash : 100000
2020-12-18, buy_prop : 1
2020-12-18, sell_prop : 1
2020-12-18, commission : 0
2020-12-18, stop_loss : 0
2020-12-18, stop_trail : 0
===Strategy level arguments===
fast_period : 10
slow_period : 30
2020-12-18, Final Portfolio Value: 378797.44000000006
2020-12-18, Final PnL: 278797.44
Time used (seconds): 0.08386111259460449
==================================================
Number of strat runs: 1
Number of strats per run: 1
Strat names: ['smac']
**************************************************
--------------------------------------------------
Strategy Parameters	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:10	slow_period:30
Returns	rtot:1.3318314171172572	ravg:0.005436046600478601	rnorm:2.9348932112596855	rnorm100:293.48932112596856
Sharpe	sharperatio:None
Drawdown	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 67), ('drawdown', 33.71986648970083), ('moneydown', 114987.24000000005)])
Timedraw	maxdrawdown:33.71986648970083	maxdrawdownperiod:67
Optimal parameters:	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:10	slow_period:30
Optimal metrics:	rtot:1.3318314171172572	ravg:0.005436046600478601	rnorm:2.9348932112596855	rnorm100:293.48932112596856	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 67), ('drawdown', 33.71986648970083), ('moneydown', 114987.24000000005)])	maxdrawdown:33.71986648970083	maxdrawdownperiod:67	sharperatio:None	pnl:278797.44	final_value:378797.44000000006
Starting Portfolio Value: 100000.00
2020-12-18, ===Global level arguments===
2020-12-18, init_cash : 100000
2020-12-18, buy_prop : 1
2020-12-18, sell_prop : 1
2020-12-18, commission : 0
2020-12-18, stop_loss : 0
2020-12-18, stop_trail : 0
===Strategy level arguments===
fast_period : 5
slow_period : 15
2020-12-18, Final Portfolio Value: 472916.96
2020-12-18, Final PnL: 372916.96
Time used (seconds): 0.07521986961364746
==================================================
Number of strat runs: 1
Number of strats per run: 1
Strat names: ['smac']
**************************************************
--------------------------------------------------
Strategy Parameters	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:5	slow_period:15
Returns	rtot:1.5537496268376367	ravg:0.006341835211582191	rnorm:3.9438405742099683	rnorm100:394.38405742099684
Sharpe	sharperatio:None
Drawdown	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 76), ('drawdown', 34.35735923043108), ('moneydown', 157340.20000000007)])
Timedraw	maxdrawdown:34.35735923043108	maxdrawdownperiod:76
Optimal parameters:	init_cash:100000	buy_prop:1	sell_prop:1	commission:0	stop_loss:0	stop_trail:0	execution_type:close	channel:	symbol:	allow_short:False	short_max:1.5	add_cash_amount:0	add_cash_freq:M	fast_period:5	slow_period:15
Optimal metrics:	rtot:1.5537496268376367	ravg:0.006341835211582191	rnorm:3.9438405742099683	rnorm100:394.38405742099684	len:0	drawdown:0.0	moneydown:0.0	max:AutoOrderedDict([('len', 76), ('drawdown', 34.35735923043108), ('moneydown', 157340.20000000007)])	maxdrawdown:34.35735923043108	maxdrawdownperiod:76	sharperatio:None	pnl:372916.96	final_value:472916.96
# backtest() returns a one-row DataFrame, so extract scalars with .iloc[0];
# the original divided whole Series, printing "0    418.0\ndtype: float64%".
for label, strat in (('1', strat_1), ('2', strat_2), ('3', strat_3), ('4', strat_4)):
    pct = round(strat['final_value'].iloc[0] / strat['init_cash'].iloc[0] * 100)
    print('Strategy ' + label + ' would give us a return of: ' + str(pct) + '%')
Strategy 1 would give us a return of: 0    418.0
dtype: float64%
Strategy 2 would give us a return of: 0    492.0
dtype: float64%
Strategy 3 would give us a return of: 0    379.0
dtype: float64%
Strategy 4 would give us a return of: 0    473.0
dtype: float64%

3. Strategy based off twitterdata

We have 3 hypotheses as to what data from Twitter we can use for our trading strategy.

  1. The first hypothesis is that we can use the compound score (sentiment) to determine whether we should buy or sell the stock. The logic here is the same as in the moving average. We try to smooth out the sentiment for the Tesla data by creating a constantly updated average sentiment. The idea is that we can see when the sentiment shifts from being positive to negative and vice versa, and this will create a buy/sell signal.

  2. The second hypothesis is that we use the amount of negative and positive tweets, as we think this could create a buy/sell signal as well.

  3. The third hypothesis is that we use the total amount of tweets per day to give us buy/sell signals.

# Rolling-window lengths (in days) for the sentiment moving averages.
comp_short_window = 5
comp_long_window = 25

# Short-horizon (comp_short_window-day) rolling mean of the daily compound score.
comp_short = pd.DataFrame()
comp_short['score'] = (
    df_combined['compound'].rolling(window=comp_short_window).mean()
)
comp_short
score
0 NaN
1 NaN
2 NaN
3 NaN
4 17.45306
... ...
347 37.60390
348 35.10000
349 37.20410
350 43.93874
351 42.55422

352 rows × 1 columns

# Long-horizon rolling mean of the compound score (comp_long_window = 25 days;
# the original comment incorrectly said 20).
comp_long = pd.DataFrame()
comp_long['score'] = df_combined['compound'].rolling(window=comp_long_window).mean()
comp_long
score
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
347 37.961352
348 38.916684
349 39.606428
350 41.444708
351 40.096776

352 rows × 1 columns

# Price vs. the two sentiment averages (scaled x5 so they share an axis).
plt.figure(figsize=(12.5, 4.5))
plt.plot(df_combined['close'], label='stockprice')
plt.plot(comp_short['score'] * 5, label='compound shortterm')
plt.plot(comp_long['score'] * 5, label='compound longterm')
plt.title('price history')
plt.xlabel('jan. 01 - dec 18')
plt.ylabel('Close price $')
plt.legend(loc='upper left')
plt.show()
# One frame holding both sentiment averages and the price, for the crossover logic.
strategy_1 = pd.DataFrame({
    'short': comp_short['score'],
    'long': comp_long['score'],
    'tsla': df_combined['close'],
})
strategy_1
short long tsla
0 NaN NaN 86.0500
1 NaN NaN 88.6000
2 NaN NaN 89.0275
3 NaN NaN 89.4550
4 17.45306 NaN 90.3100
... ... ... ...
347 37.60390 37.961352 639.8300
348 35.10000 38.916684 633.2500
349 37.20410 39.606428 622.7700
350 43.93874 41.444708 655.9000
351 42.55422 40.096776 695.0000

352 rows × 3 columns

def buy_sell(df):

  """Create buy/sell signal prices where the short moving average crosses the long one.

  Parameters
  ----------
  df : pd.DataFrame
      Must contain 'short' and 'long' (moving averages) and 'tsla' (close price).

  Returns
  -------
  tuple of (list, list)
      (buy prices, sell prices), one entry per row; np.nan where no signal fires.
  """
  sigPriceBuy = []
  sigPriceSell = []
  # flag tracks state: -1 = no signal yet, 1 = currently bought, 0 = currently sold.
  flag = -1

  for i in range(len(df)):
    # Positional access (.iloc) works for any index, not just the default
    # RangeIndex the original label lookup relied on.
    short_val = df['short'].iloc[i]
    long_val = df['long'].iloc[i]
    if short_val > long_val:
      if flag != 1:
        # Short average crossed above the long one: buy at today's close.
        sigPriceBuy.append(df['tsla'].iloc[i])
        sigPriceSell.append(np.nan)
        flag = 1
      else:
        sigPriceBuy.append(np.nan)
        sigPriceSell.append(np.nan)
    elif short_val < long_val:
      if flag != 0:
        # Short average crossed below the long one: sell at today's close.
        sigPriceBuy.append(np.nan)
        sigPriceSell.append(df['tsla'].iloc[i])
        flag = 0
      else:
        sigPriceBuy.append(np.nan)
        sigPriceSell.append(np.nan)
    else:
      # Equal values, or NaN (both comparisons are False): no signal.
      sigPriceBuy.append(np.nan)
      sigPriceSell.append(np.nan)

  return (sigPriceBuy, sigPriceSell)
# Bind the result to its own name: the original `buy_sell = buy_sell(...)`
# overwrote the function with its return value, making it single-use.
signal_prices_1 = buy_sell(strategy_1)
strategy_1['Buy_Signal_Price'] = signal_prices_1[0]
strategy_1['Sell_Signal_Price'] = signal_prices_1[1]
plt.figure(figsize=(12.6, 4.6))
plt.plot(strategy_1['tsla'], label ='tesla')
plt.plot(comp_short['score'], label = 'comp_short')
plt.plot(comp_long['score'], label = 'comp_long')
# Danish labels: Køb = buy, Salg = sell.
plt.scatter(strategy_1.index, strategy_1['Buy_Signal_Price'], label = 'Køb', marker = '^', color='green')
plt.scatter(strategy_1.index, strategy_1['Sell_Signal_Price'], label = 'Salg', marker = 'v', color='red')
plt.title('Køb og salg baseret på Compound')
plt.xlabel('01-01-2020 til 18-12-2020 (i antal dage)')
plt.ylabel('Close price $')
plt.legend(loc='upper left')
plt.show()
# Initialize the `signals` DataFrame with the `signal` column (1.0 = in the market).
signals = pd.DataFrame(index=strategy_1.index)
signals['signal'] = 1.0
signals['short'] = strategy_1['short']
signals['long'] = strategy_1['long']
# Signal is 1.0 while the short average sits above the long one, else 0.0.
# .iloc-based assignment replaces the original chained assignment
# (`signals['signal'][n:] = ...`), which triggers SettingWithCopyWarning and
# is unreliable in modern pandas.
signals.iloc[comp_short_window:, signals.columns.get_loc('signal')] = np.where(
    signals['short'].iloc[comp_short_window:]
    > signals['long'].iloc[comp_short_window:], 1.0, 0.0)
# diff of the on/off signal: +1.0 marks a buy day, -1.0 a sell day.
signals['positions'] = signals['signal'].diff()
# No starting cash: the equity curve below is relative to position value only.
initial_capital = float(0)

# Create a DataFrame `positions`: shares held on each day.
positions = pd.DataFrame(index=signals.index)
# Hold 100 shares whenever the signal is on.
positions['TSLA'] = 100 * signals['signal']
portfolio = positions.multiply(strategy_1['tsla'], axis=0)

# Store the day-to-day difference in shares owned (trades executed).
pos_diff = positions.diff()

# Add `holdings` to portfolio: market value of the shares held.
portfolio['holdings'] = (positions.multiply(strategy_1['tsla'], axis=0)).sum(axis=1)

# Add `cash` to portfolio: cash left after paying for every trade so far.
portfolio['cash'] = initial_capital - (pos_diff.multiply(strategy_1['tsla'], axis=0)).sum(axis=1).cumsum()

# Add `total` to portfolio: total equity = cash + holdings.
portfolio['total'] = portfolio['cash'] + portfolio['holdings']

# Daily and cumulative returns of the equity curve.
portfolio['returns'] = portfolio['total'].pct_change()
portfolio['cumulative_ret'] = (portfolio['returns'] + 1).cumprod()
# Create a figure for the strategy-1 equity curve.
fig = plt.figure()

ax1 = fig.add_subplot(111, ylabel='Portfolio value in $')

# Plot the equity curve in dollars.
portfolio['total'].plot(ax=ax1, lw=2.)

# Magenta up-triangles mark buy days, black down-triangles mark sell days.
ax1.plot(portfolio.loc[signals.positions == 1.0].index, 
         portfolio.total[signals.positions == 1.0],
         '^', markersize=10, color='m')
ax1.plot(portfolio.loc[signals.positions == -1.0].index, 
         portfolio.total[signals.positions == -1.0],
         'v', markersize=10, color='k')

# Show the plot.
plt.show()
# .iloc[-1] reads the final cumulative return regardless of frame length;
# the original hard-coded position 351 breaks if the data range changes.
print('strategy based on twitter compound score would give us a return of:' + str(round(portfolio['cumulative_ret'].iloc[-1] * 100)) + '%')
strategy based on twitter compound score would give us a return of:683.0%

2. Buy sell based on negative and positive tweets

# Rolling windows for the positive/negative tweet-count strategy.
strategy_2_short = 3
strategy_2_long = 20

# Storing the averages in a df.
strategy_2 = pd.DataFrame()
# NOTE(review): 'long' follows the positive-tweet average while 'short' negates
# the negative-tweet average. The displayed head() values for 'short' are
# positive, which suggests tweet_count_negative may already be stored with a
# negative sign upstream — confirm the sign convention and document why the
# positive count maps to the long window.
strategy_2['long'] = df_combined['tweet_count_positive'].rolling(window= strategy_2_long).mean() 
strategy_2['short'] = -df_combined['tweet_count_negative'].rolling(window=strategy_2_short).mean() # TODO(review): explain why positive is long and not short
strategy_2['tsla'] = df_combined['close']
strategy_2.head()
long short tsla
0 NaN NaN 86.0500
1 NaN NaN 88.6000
2 NaN 39.333333 89.0275
3 NaN 33.666667 89.4550
4 NaN 26.666667 90.3100
def buy_sell_2(df):

  """Generate buy/sell price markers at crossovers of the short and long averages.

  Expects columns 'short', 'long' and 'tsla'; returns a (buys, sells) pair of
  lists aligned with the rows, holding the close price on signal days and
  np.nan everywhere else.
  """
  buys = []
  sells = []
  state = -1  # -1: no signal yet, 1: last signal was a buy, 0: last was a sell

  for short_val, long_val, price in zip(df['short'], df['long'], df['tsla']):
    buy_price = np.nan
    sell_price = np.nan
    if short_val > long_val and state != 1:
      # First day the short average is above the long one: buy.
      buy_price = price
      state = 1
    elif short_val < long_val and state != 0:
      # First day the short average is below the long one: sell.
      sell_price = price
      state = 0
    # NaN comparisons are False both ways, so warm-up rows produce no signal.
    buys.append(buy_price)
    sells.append(sell_price)

  return (buys, sells)
# Bind the result to a new name instead of overwriting the buy_sell_2 function
# (the original assignment made the function single-use).
signal_prices_2 = buy_sell_2(strategy_2)
strategy_2['buy_signal_price'] = signal_prices_2[0]
strategy_2['sell_signal_price'] = signal_prices_2[1]
plt.figure(figsize=(12.6, 4.6))
plt.plot(strategy_2['tsla'], label ='tesla')
plt.plot(strategy_2['short'], label = 'short')
plt.plot(strategy_2['long'], label = 'long')
# Danish labels: Køb = buy, Salg = sell.
plt.scatter(strategy_2.index, strategy_2['buy_signal_price'], label = 'Køb', marker = '^', color='green')
plt.scatter(strategy_2.index, strategy_2['sell_signal_price'], label = 'Salg', marker = 'v', color='red')
plt.title('Køb og salg baseret på antal positive/negative tweets')
plt.xlabel('01-01-2020 til 18-12-2020 (i antal dage)')
plt.ylabel('Close price $')
plt.legend(loc='upper left')
plt.show()
# Initialize the `signals_2` DataFrame with the `signal` column (1.0 = in the market).
signals_2 = pd.DataFrame(index=strategy_2.index)
signals_2['signal'] = 1.0

# Copy the short & long rolling averages of the tweet counts.
signals_2['short'] = strategy_2['short']
signals_2['long'] = strategy_2['long']

# Signal is on while short < long (note: inverted relative to strategy 1).
# .iloc-based assignment replaces the original chained assignment, which
# triggers SettingWithCopyWarning and is unreliable in modern pandas.
signals_2.iloc[strategy_2_short:, signals_2.columns.get_loc('signal')] = np.where(
    signals_2['short'].iloc[strategy_2_short:]
    < signals_2['long'].iloc[strategy_2_short:], 1.0, 0.0)
# Generate trading orders: +1.0 marks a buy day, -1.0 a sell day.
signals_2['positions'] = signals_2['signal'].diff()
positions_2 = pd.DataFrame(index=signals_2.index)
# Hold 100 shares whenever the signal is on.
positions_2['TSLA'] = 100 * signals_2['signal']
portfolio_2 = positions_2.multiply(strategy_2['tsla'], axis=0)

# Day-to-day difference in shares owned (trades executed).
pos_diff_2 = positions_2.diff()

# Market value of the shares held.
portfolio_2['holdings'] = (positions_2.multiply(strategy_2['tsla'], axis=0)).sum(axis=1)

# Cash after paying for all trades so far (initial_capital defined for strategy 1).
portfolio_2['cash'] = initial_capital - (pos_diff_2.multiply(strategy_2['tsla'], axis=0)).sum(axis=1).cumsum()

# Total equity = cash + holdings.
portfolio_2['total'] = portfolio_2['cash'] + portfolio_2['holdings']

# Daily and cumulative returns of the equity curve.
portfolio_2['returns'] = portfolio_2['total'].pct_change()

portfolio_2['cumulative_ret'] = (portfolio_2['returns'] + 1).cumprod()
# Figure for the strategy-2 equity curve.
fig = plt.figure()

ax1 = fig.add_subplot(111, ylabel='Portfolio value in $')

# Plot the equity curve in dollars.
portfolio_2['total'].plot(ax=ax1, lw=2.)

# Magenta up-triangles mark buy days, black down-triangles mark sell days.
ax1.plot(portfolio_2.loc[signals_2.positions == 1.0].index, 
         portfolio_2.total[signals_2.positions == 1.0],
         '^', markersize=10, color='m')
ax1.plot(portfolio_2.loc[signals_2.positions == -1.0].index, 
         portfolio_2.total[signals_2.positions == -1.0],
         'v', markersize=10, color='k')

# Show the plot.
plt.show()
# .iloc[-1] is robust to the frame's length (was a hard-coded position 351).
print('strategy based on negative and positive tweets score would give us a return of:' + str(round(portfolio_2['cumulative_ret'].iloc[-1] * 100)) + '%')
strategy based on negative and positive tweets score would give us a return of:817.0%
# Rolling windows for the total tweet-volume strategy.
strategy_3_short = 5
strategy_3_long = 30

# Storing the averages in a df.
strategy_3 = pd.DataFrame()
# Short and long rolling means of the total daily tweet count.
strategy_3['short'] = df_combined['tweet_count'].rolling(window= strategy_3_short).mean() 
strategy_3['long'] = df_combined['tweet_count'].rolling(window=strategy_3_long).mean() 
strategy_3['tsla'] = df_combined['close']
strategy_3.head()
short long tsla
0 NaN NaN 86.0500
1 NaN NaN 88.6000
2 NaN NaN 89.0275
3 NaN NaN 89.4550
4 98.0 NaN 90.3100
def buy_sell_3(df):

  """Mark buy/sell prices wherever the short moving average crosses the long one.

  Expects columns 'short', 'long' and 'tsla'. Returns (buy_prices, sell_prices),
  lists aligned with the rows: the close price on signal days, np.nan otherwise.
  """
  n_rows = len(df)
  # Pre-fill with NaN; only crossover days get a price written in.
  buy_prices = [np.nan] * n_rows
  sell_prices = [np.nan] * n_rows
  position = -1  # -1 before the first signal, 1 after a buy, 0 after a sell

  shorts = df['short'].tolist()
  longs = df['long'].tolist()
  prices = df['tsla'].tolist()

  for idx in range(n_rows):
    if shorts[idx] > longs[idx]:
      if position != 1:
        # Short average moved above the long one: buy signal.
        buy_prices[idx] = prices[idx]
        position = 1
    elif shorts[idx] < longs[idx]:
      if position != 0:
        # Short average moved below the long one: sell signal.
        sell_prices[idx] = prices[idx]
        position = 0
    # Equal or NaN values fall through and leave both entries as NaN.

  return (buy_prices, sell_prices)
# Bind the result to a new name instead of overwriting the buy_sell_3 function
# (the original assignment made the function single-use).
signal_prices_3 = buy_sell_3(strategy_3)
strategy_3['buy_signal_price'] = signal_prices_3[0]
strategy_3['sell_signal_price'] = signal_prices_3[1]

plt.figure(figsize=(12.6, 4.6))
plt.plot(strategy_3['tsla'], label ='tesla')
plt.plot(strategy_3['short'], label = 'short')
plt.plot(strategy_3['long'], label = 'long')
# Danish labels: Køb = buy, Salg = sell.
plt.scatter(strategy_3.index, strategy_3['buy_signal_price'], label = 'Køb', marker = '^', color='green')
plt.scatter(strategy_3.index, strategy_3['sell_signal_price'], label = 'Salg', marker = 'v', color='red')
plt.title('Køb og salg baseret på antal tweets')
plt.xlabel('01-01-2020 til 18-12-2020 (i antal dage)')
plt.ylabel('Close price $')
plt.legend(loc='upper left')
plt.show()
# Use strategy_3's own index (the original copy-pasted strategy_2.index; the
# frames happen to share an index here, but the cross-dependency was accidental).
signals_3 = pd.DataFrame(index=strategy_3.index)
signals_3['signal'] = 1.0

# Copy the short & long rolling averages of the total tweet volume.
signals_3['short'] = strategy_3['short']
signals_3['long'] = strategy_3['long']

# Signal is on while short > long. .iloc-based assignment replaces the original
# chained assignment, which triggers SettingWithCopyWarning in modern pandas.
signals_3.iloc[strategy_3_short:, signals_3.columns.get_loc('signal')] = np.where(
    signals_3['short'].iloc[strategy_3_short:]
    > signals_3['long'].iloc[strategy_3_short:], 1.0, 0.0)
# Generate trading orders: +1.0 marks a buy day, -1.0 a sell day.
signals_3['positions'] = signals_3['signal'].diff()
positions_3 = pd.DataFrame(index=signals_3.index)
# Hold 100 shares whenever the signal is on.
positions_3['TSLA'] = 100 * signals_3['signal']
portfolio_3 = positions_3.multiply(strategy_3['tsla'], axis=0)

# Day-to-day difference in shares owned (trades executed).
pos_diff_3 = positions_3.diff()

# Market value of the shares held.
portfolio_3['holdings'] = (positions_3.multiply(strategy_3['tsla'], axis=0)).sum(axis=1)

# Cash after paying for all trades so far (initial_capital defined for strategy 1).
portfolio_3['cash'] = initial_capital - (pos_diff_3.multiply(strategy_3['tsla'], axis=0)).sum(axis=1).cumsum()

# Total equity = cash + holdings.
portfolio_3['total'] = portfolio_3['cash'] + portfolio_3['holdings']

# Daily and cumulative returns of the equity curve.
portfolio_3['returns'] = portfolio_3['total'].pct_change()

portfolio_3['cumulative_ret'] = (portfolio_3['returns'] + 1).cumprod()
# Figure for the strategy-3 equity curve.
fig = plt.figure()

ax1 = fig.add_subplot(111, ylabel='Portfolio value in $')

# Plot the equity curve in dollars.
portfolio_3['total'].plot(ax=ax1, lw=2.)

# Magenta up-triangles mark buy days, black down-triangles mark sell days.
ax1.plot(portfolio_3.loc[signals_3.positions == 1.0].index, 
         portfolio_3.total[signals_3.positions == 1.0],
         '^', markersize=10, color='m')
ax1.plot(portfolio_3.loc[signals_3.positions == -1.0].index, 
         portfolio_3.total[signals_3.positions == -1.0],
         'v', markersize=10, color='k')

# Show the plot.
plt.show()
# Message fixed: this is the total-tweet-count strategy (the original print was
# copy-pasted from strategy 2). Also use .iloc[-1] instead of hard-coded 351.
print('strategy based on total tweet count would give us a return of:' + str(round(portfolio_3['cumulative_ret'].iloc[-1] * 100)) + '%')
strategy based on negative and positive tweets score would give us a return of:648.0%

Neural net

In order to see whether positive and negative tweet counts can be used for forecasting, we implement two LSTMs: one that predicts based upon closing price, volume, and the high and low prices, and another trained on the same stock data but with the positive and negative tweet counts added, in order to see whether they improve the RMSE score. The algorithm used is the LSTM (long short-term memory network), which Jin, Yang & Liu (2019) demonstrate to be a viable choice when predicting stock prices based upon stock data and sentiment analysis. The implemented LSTMs are very basic in structure and only serve to demonstrate whether incorporating our Twitter sentiment data improves model performance. This may indicate that Twitter sentiment can be viable for stock forecasting, even though our sentiment classification is far from perfect.

We forecast one day ahead, based upon five days of prior data, and run this as a loop over the entire dataframe. This was chosen because performance improved when we shortened the input window used for forecasting. Forecasting one day ahead follows Jin, Yang & Liu (2019) and is a logical choice, as one day is the nearest horizon we can forecast, which reduces uncertainty compared to forecasting two or more days ahead.

The models are trained using batch_size = 1, as the data they are given is minimal. To keep things equal, both models are given closing price, high and low; volume and other stock data generally decreased performance. In addition to what model 1 receives, model 2 is also given tweet_count_negative and tweet_count_positive. Both models are run for 30 epochs to keep the comparison fair. While overfitting is technically a concern, what is crucial here is how well the model predicts one day ahead on data it has not seen before, which is what we ask the models to do for 19-12-2020.

Model 1- only stock data

# Use the date column as the index so predictions can be aligned on dates.
df_combined.set_index('index', inplace=True)
df_net = df_combined[['close', 'High', 'Low']]
# A separate scaler fitted on 'close' alone lets us inverse-transform the
# model's single-column predictions back to dollar prices.
close_scaler = MinMaxScaler(feature_range=(-1,1))
df_scaler = MinMaxScaler(feature_range=(-1,1))
close_scaler.fit(df_net[['close']])
# Fit the full-feature scaler and build a new DataFrame with the scaled values.
df = pd.DataFrame(df_scaler.fit_transform(df_net), columns=df_net.columns, index=df_net.index)
df.dropna(inplace=True)
# Input window length (days of history fed to the model).
n_input = 5
# Forecast horizon in days.
n_output = 1
# Number of feature columns.
n_features=df.shape[1]
# We forecast n_output (= 1) day ahead (an earlier comment saying 10 days was stale).

def split_sequence(values, n_input, n_output):

  """Slice a 2-D array into supervised-learning windows.

  Each sample pairs `n_input` consecutive rows (all feature columns) with the
  following `n_output` values of column 0 (the close price).

  Returns (X, y) as numpy arrays shaped (samples, n_input, n_features) and
  (samples, n_output); both are empty when the data is too short.
  """
  total = len(values)
  # Number of complete (input, output) windows that fit into the data.
  n_samples = max(total - n_input - n_output + 1, 0)

  windows = [values[start:start + n_input, :] for start in range(n_samples)]
  targets = [values[start + n_input:start + n_input + n_output, 0]
             for start in range(n_samples)]

  return np.array(windows), np.array(targets)
def validater(n_input, n_output):

  """Collect model predictions over rolling windows of the global `df`.

  Relies on module-level globals: `df` (scaled features), `model`,
  `close_scaler` and `n_features`. Returns a DataFrame with `df`'s index and
  a single close-price column holding the inverse-scaled predictions
  (NaN wherever no window produced a value).
  """
  # Empty frame (same index as df, one close column) filled in below.
  predictions = pd.DataFrame(index=df.index, columns=[df.columns[0]])
 
  for i in range(n_input, len(df)-n_input, n_output):
    # Rolling window of n_input rows, taken from the end of df moving backwards.
    a = df[-i - n_input:-i]
 
    # Predict the next n_output scaled close values from this window.
    pred_y = model.predict(np.array(a).reshape(1, n_input, n_features))
 
    # Map the scaled predictions back to dollar prices.
    pred_y = close_scaler.inverse_transform(pred_y)[0]
 
    # NOTE(review): pd.date_range starts AT the window's last date, so the first
    # predicted value is stamped on the same day as the last input row — confirm
    # this matches the intended one-day-ahead alignment. Assumes df has a
    # DatetimeIndex at this point (set earlier via set_index).
    pred_df = pd.DataFrame(pred_y, 
                           index = pd.date_range(start=a.index[-1], 
                                                       periods=len(pred_y)),
                           columns = [a.columns[0]])
    # Write these dated predictions into the master frame.
    predictions.update(pred_df)
 
  return predictions
def val_rmse(df1, df2):

  """Root-mean-square error between the 'close' columns of two DataFrames.

  Rows where either frame has a missing value are dropped first, so
  partially-overlapping predictions are compared only on shared dates.

  Parameters
  ----------
  df1, df2 : pd.DataFrame
      Each must have a 'close' column; indexes are aligned on assignment.

  Returns
  -------
  float
      The RMSE over the overlapping, non-NaN rows.
  """
  df = df1.copy()

  df['close2'] = df2.close

  df.dropna(inplace=True)

  # Work with a scalar mean directly: the original took float() of a
  # one-element Series (via df[['diff']]), which is deprecated/removed
  # in modern pandas.
  squared_errors = (df.close - df.close2) ** 2

  return float(np.sqrt(squared_errors.mean()))
# Build supervised windows from the scaled feature frame.
X, y = split_sequence(df.to_numpy(), n_input, n_output)
# NOTE(review): early_stop is defined but never passed to model.fit
# (callbacks=[early_stop]), so early stopping is not actually applied.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model = Sequential()
# Three stacked LSTM layers; the first two return full sequences so the next
# LSTM layer receives sequence input. The final Dense emits n_output values.
model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(n_input, n_features)))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(LSTM(64, activation='tanh'))
model.add(Dense(n_output))
model.compile(optimizer='adam', loss='mse')
model.summary()

# batch_size=1 because the dataset is minimal; 10% held out for validation.
model.fit(X,y, epochs=30, batch_size=1, validation_split=0.1)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm (LSTM)                  (None, 5, 128)            67584     
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 128)            131584    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
=================================================================
Total params: 248,641
Trainable params: 248,641
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
312/312 [==============================] - 12s 8ms/step - loss: 0.0458 - val_loss: 0.0578
Epoch 2/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0075 - val_loss: 0.0742
Epoch 3/30
312/312 [==============================] - 2s 6ms/step - loss: 0.0052 - val_loss: 0.0633
Epoch 4/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0047 - val_loss: 0.0765
Epoch 5/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0050 - val_loss: 0.0549
Epoch 6/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0042 - val_loss: 0.0273
Epoch 7/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0037 - val_loss: 0.0365
Epoch 8/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0039 - val_loss: 0.0661
Epoch 9/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0041 - val_loss: 0.0676
Epoch 10/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0025 - val_loss: 0.0326
Epoch 11/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0028 - val_loss: 0.0483
Epoch 12/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0030 - val_loss: 0.0457
Epoch 13/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0027 - val_loss: 0.0634
Epoch 14/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0016 - val_loss: 0.0375
Epoch 15/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0026 - val_loss: 0.0411
Epoch 16/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0021 - val_loss: 0.0205
Epoch 17/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0020 - val_loss: 0.0154
Epoch 18/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0026 - val_loss: 0.0751
Epoch 19/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0016 - val_loss: 0.0731
Epoch 20/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0037 - val_loss: 0.0700
Epoch 21/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0023 - val_loss: 0.0429
Epoch 22/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0018 - val_loss: 0.0392
Epoch 23/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0016 - val_loss: 0.0416
Epoch 24/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0015 - val_loss: 0.0290
Epoch 25/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0020 - val_loss: 0.0428
Epoch 26/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0014 - val_loss: 0.0402
Epoch 27/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0018 - val_loss: 0.0282
Epoch 28/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0018 - val_loss: 0.0306
Epoch 29/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0014 - val_loss: 0.0700
Epoch 30/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0016 - val_loss: 0.0567
<tensorflow.python.keras.callbacks.History at 0x7ffa1da1ec88>
# Training-loss curve per epoch.
loss_per_epoch = model.history.history['loss']
plt.plot(range(len(loss_per_epoch)),loss_per_epoch);
# Unscale the close column so actual prices are in dollars, matching validater's output.
actual = pd.DataFrame(close_scaler.inverse_transform(df[['close']]), index=df.index, columns=[df.columns[0]])
predictions = validater(n_input, n_output)
 
# Printing the RMSE (dollars) between actual and rolling-window predictions.
print("RMSE:", val_rmse(actual, predictions))
    
# Plotting
plt.figure(figsize=(16,6))
 
# Plotting those predictions
plt.plot(predictions, label='Predicted')
 
# Plotting the actual values
plt.plot(actual, label='Actual')
 
plt.title("Predicted vs Actual Closing Prices")
plt.ylabel("Price")
plt.legend()
plt.show()
RMSE: 20.59045685690608
# Display recent TSLA quotes; tesla_now is presumably fetched earlier via
# yfinance — confirm against the cell that defines it.
tesla_now
Open High Low Close Volume Dividends Stock Splits
Date
2020-12-18 668.90 695.00 628.54 695.00 222126200 0 0
2020-12-21 666.24 668.50 646.07 649.86 58045300 0 0
2020-12-22 648.00 649.88 614.23 640.34 51716000 0 0
2020-12-23 632.20 651.50 622.57 645.98 33173000 0 0
2020-12-24 642.99 666.09 641.00 661.77 22865600 0 0
2020-12-28 674.51 681.40 660.80 663.69 32278600 0 0
2020-12-29 661.00 669.90 655.00 665.99 22910800 0 0
2020-12-30 672.00 696.60 668.36 694.78 42846000 0 0
2020-12-31 699.99 718.72 691.12 705.67 49570900 0 0
# NOTE(review): `periods` is never used below — leftover from an earlier
# 10-day-forecast experiment; only one step ahead is predicted here.
periods= 10
# Predict one step ahead from the last n_input rows of scaled features.
pred_y_10 = model.predict(np.array(df.tail(n_input)).reshape(1, n_input, n_features))
# Inverse-transform to get the unscaled (dollar) value.
pred_y_10 = close_scaler.inverse_transform(pred_y_10)[0]
# Inspect the scaled feature frame used for training.
df
close High Low
index
2020-01-02 -0.955649 -0.983160 -0.949001
2020-01-03 -0.947460 -0.967982 -0.938113
2020-01-04 -0.946087 -0.967982 -0.938113
2020-01-05 -0.944714 -0.967982 -0.938113
2020-01-06 -0.941968 -0.969578 -0.935893
... ... ... ...
2020-12-14 0.822821 0.829813 0.934317
2020-12-15 0.801689 0.843330 0.983024
2020-12-16 0.768033 0.796427 0.915694
2020-12-17 0.874430 0.882156 0.967624
2020-12-18 1.000000 1.000000 1.000000

352 rows × 3 columns

# Stamp the prediction(s) on the day(s) following the last observed date.
preds = pd.DataFrame(pred_y_10, index=pd.date_range(start=df.index[-1]+timedelta(days=1),periods=len(pred_y_10)), columns=[df.columns[0]])
preds
close
2020-12-19 560.678284

This model does not perform very well. Given that the 19th falls on a weekend, the model should have approximated (friday_price + monday_price) / 2 under our previous assumptions. This price is roughly (695 + 650) / 2 = 672.5.

Model 2 - twitter data

# Model 2 adds the positive/negative tweet counts to the stock features.
df_net = df_combined[['close', 'High', 'Low', 'tweet_count_positive', 'tweet_count_negative']]
# Fresh scalers for this feature set; the close-only scaler lets us
# inverse-transform predictions back to dollars. (The original fitted and
# transformed once with the model-1 scalers and immediately redid everything
# with new ones; the redundant first pass is removed.)
close_scaler = MinMaxScaler(feature_range=(-1,1))
df_scaler = MinMaxScaler(feature_range=(-1,1))
close_scaler.fit(df_net[['close']])
# Fit and build a new DataFrame with the scaled values.
df = pd.DataFrame(df_scaler.fit_transform(df_net), columns=df_net.columns, index=df_net.index)
df.dropna(inplace=True)
# Input window length (days of history fed to the model).
n_input = 5
# Forecast horizon in days.
n_output = 1
# Number of feature columns (now 5, including the two tweet counts).
n_features = df.shape[1]
X, y = split_sequence(df.to_numpy(), n_input, n_output)
model = Sequential()
# Same architecture and training setup as model 1 so the comparison is fair.
model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(n_input, n_features)))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(LSTM(64, activation='tanh'))
model.add(Dense(n_output))
model.compile(optimizer='adam', loss='mse')
model.summary()

model.fit(X,y, epochs=30, batch_size=1, validation_split=0.1)
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_3 (LSTM)                (None, 5, 128)            68608     
_________________________________________________________________
lstm_4 (LSTM)                (None, 5, 128)            131584    
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
=================================================================
Total params: 249,665
Trainable params: 249,665
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
312/312 [==============================] - 5s 7ms/step - loss: 0.0407 - val_loss: 0.0938
Epoch 2/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0061 - val_loss: 0.0933
Epoch 3/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0064 - val_loss: 0.1130
Epoch 4/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0040 - val_loss: 0.1400
Epoch 5/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0051 - val_loss: 0.1151
Epoch 6/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0043 - val_loss: 0.0995
Epoch 7/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0037 - val_loss: 0.0449
Epoch 8/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0043 - val_loss: 0.0870
Epoch 9/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0035 - val_loss: 0.1937
Epoch 10/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0037 - val_loss: 0.0642
Epoch 11/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0026 - val_loss: 0.0536
Epoch 12/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0028 - val_loss: 0.0254
Epoch 13/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0025 - val_loss: 0.0297
Epoch 14/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0017 - val_loss: 0.0462
Epoch 15/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0021 - val_loss: 0.0279
Epoch 16/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0017 - val_loss: 0.0574
Epoch 17/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0022 - val_loss: 0.0240
Epoch 18/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0030 - val_loss: 0.0244
Epoch 19/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0020 - val_loss: 0.0318
Epoch 20/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0020 - val_loss: 0.0331
Epoch 21/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0021 - val_loss: 0.0451
Epoch 22/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0019 - val_loss: 0.0393
Epoch 23/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0020 - val_loss: 0.0285
Epoch 24/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0019 - val_loss: 0.0682
Epoch 25/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0016 - val_loss: 0.0470
Epoch 26/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0017 - val_loss: 0.0332
Epoch 27/30
312/312 [==============================] - 2s 6ms/step - loss: 0.0018 - val_loss: 0.0717
Epoch 28/30
312/312 [==============================] - 1s 5ms/step - loss: 0.0021 - val_loss: 0.0493
Epoch 29/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0017 - val_loss: 0.0484
Epoch 30/30
312/312 [==============================] - 1s 4ms/step - loss: 0.0021 - val_loss: 0.0829
<tensorflow.python.keras.callbacks.History at 0x7ffa1007a550>
# Training-loss curve per epoch for model 2.
loss_per_epoch = model.history.history['loss']
plt.plot(range(len(loss_per_epoch)),loss_per_epoch);
# Unscale the close column so actual prices are in dollars, matching validater's output.
actual = pd.DataFrame(close_scaler.inverse_transform(df[['close']]), index=df.index, columns=[df.columns[0]])
predictions = validater(n_input, n_output)
 
# Printing the RMSE (dollars) — compare against model 1's score above.
print("RMSE:", val_rmse(actual, predictions))
    
# Plotting
plt.figure(figsize=(16,6))
 
# Plotting those predictions
plt.plot(predictions, label='Predicted')
 
# Plotting the actual values
plt.plot(actual, label='Actual')
 
plt.title("Predicted vs Actual Closing Prices")
plt.ylabel("Price")
plt.legend()
plt.show()
RMSE: 24.829738567443563
# Predict one step ahead from the last n_input rows of scaled features.
pred_y_10 = model.predict(np.array(df.tail(n_input)).reshape(1, n_input, n_features))
# Inverse-transform to get the unscaled (dollar) value.
pred_y_10 = close_scaler.inverse_transform(pred_y_10)[0]
# Stamp the prediction on the day after the last observed date.
preds = pd.DataFrame(pred_y_10, index=pd.date_range(start=df.index[-1]+timedelta(days=1),periods=len(pred_y_10)), columns=[df.columns[0]])
preds
close
2020-12-19 533.673218